diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index d9bfca763819f..df23c70e25514 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1307,6 +1307,25 @@ class ScalarEvolution { LoopGuards(ScalarEvolution &SE) : SE(SE) {} + /// Recursively collect loop guards in \p Guards, starting from + /// block \p Block with predecessor \p Pred. The intended starting point + /// is to collect from a loop header and its predecessor. + static void + collectFromBlock(ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, + const BasicBlock *Block, const BasicBlock *Pred, + SmallPtrSetImpl &VisitedBlocks, + unsigned Depth = 0); + + /// Collect loop guards in \p Guards, starting from PHINode \p + /// Phi, by calling \p collectFromBlock on the incoming blocks of + /// \Phi and trying to merge the found constraints into a single + /// combined one for \p Phi. + static void collectFromPHI( + ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, + const PHINode &Phi, SmallPtrSetImpl &VisitedBlocks, + SmallDenseMap &IncomingGuards, + unsigned Depth); + public: /// Collect rewrite map for loop guards for loop \p L, together with flags /// indicating if NUW and NSW can be preserved during rewriting. diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 51cffac808768..f4fd6c4f46a7d 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -222,6 +222,10 @@ static cl::opt RangeIterThreshold( cl::desc("Threshold for switching to iteratively computing SCEV ranges"), cl::init(32)); +static cl::opt MaxLoopGuardCollectionDepth( + "scalar-evolution-max-loop-guard-collection-depth", cl::Hidden, + cl::desc("Maximum depth for recrusive loop guard collection"), cl::init(1)); + static cl::opt ClassifyExpressions("scalar-evolution-classify-expressions", cl::Hidden, cl::init(true), @@ -10608,7 +10612,7 @@ ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB) if (const Loop *L = LI.getLoopFor(BB)) return {L->getLoopPredecessor(), L->getHeader()}; - return {nullptr, nullptr}; + return {nullptr, BB}; } /// SCEV structural equivalence is usually sufficient for testing whether two @@ -15089,7 +15093,81 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS, ScalarEvolution::LoopGuards ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { + BasicBlock *Header = L->getHeader(); + BasicBlock *Pred = L->getLoopPredecessor(); LoopGuards Guards(SE); + SmallPtrSet VisitedBlocks; + collectFromBlock(SE, Guards, Header, Pred, VisitedBlocks); + return Guards; +} + +void ScalarEvolution::LoopGuards::collectFromPHI( + ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, + const PHINode &Phi, SmallPtrSetImpl &VisitedBlocks, + SmallDenseMap &IncomingGuards, + unsigned Depth) { + if (!SE.isSCEVable(Phi.getType())) + return; + + using MinMaxPattern = std::pair; + auto GetMinMaxConst = [&](unsigned IncomingIdx) -> MinMaxPattern { + const BasicBlock *InBlock = Phi.getIncomingBlock(IncomingIdx); + if (!VisitedBlocks.insert(InBlock).second) + return {nullptr, scCouldNotCompute}; + auto [G, Inserted] = IncomingGuards.try_emplace(InBlock, LoopGuards(SE)); + if (Inserted) + collectFromBlock(SE, G->second, Phi.getParent(), InBlock, VisitedBlocks, + Depth + 1); + auto &RewriteMap = G->second.RewriteMap; + if (RewriteMap.empty()) + return {nullptr, scCouldNotCompute}; + auto S = RewriteMap.find(SE.getSCEV(Phi.getIncomingValue(IncomingIdx))); + if (S == RewriteMap.end()) + return {nullptr, scCouldNotCompute}; + auto *SM = dyn_cast_if_present(S->second); + if (!SM) + return {nullptr, scCouldNotCompute}; + if (const SCEVConstant *C0 = dyn_cast(SM->getOperand(0))) + return {C0, SM->getSCEVType()}; + return {nullptr, scCouldNotCompute}; + }; + auto MergeMinMaxConst = [](MinMaxPattern P1, + MinMaxPattern P2) -> MinMaxPattern { + auto [C1, T1] = P1; + auto [C2, T2] = P2; + if (!C1 || !C2 || T1 != T2) + return {nullptr, scCouldNotCompute}; + switch (T1) { + case scUMaxExpr: + return {C1->getAPInt().ult(C2->getAPInt()) ? C1 : C2, T1}; + case scSMaxExpr: + return {C1->getAPInt().slt(C2->getAPInt()) ? C1 : C2, T1}; + case scUMinExpr: + return {C1->getAPInt().ugt(C2->getAPInt()) ? C1 : C2, T1}; + case scSMinExpr: + return {C1->getAPInt().sgt(C2->getAPInt()) ? C1 : C2, T1}; + default: + llvm_unreachable("Trying to merge non-MinMaxExpr SCEVs."); + } + }; + auto P = GetMinMaxConst(0); + for (unsigned int In = 1; In < Phi.getNumIncomingValues(); In++) { + if (!P.first) + break; + P = MergeMinMaxConst(P, GetMinMaxConst(In)); + } + if (P.first) { + const SCEV *LHS = SE.getSCEV(const_cast(&Phi)); + SmallVector Ops({P.first, LHS}); + const SCEV *RHS = SE.getMinMaxExpr(P.second, Ops); + Guards.RewriteMap.insert({LHS, RHS}); + } +} + +void ScalarEvolution::LoopGuards::collectFromBlock( + ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, + const BasicBlock *Block, const BasicBlock *Pred, + SmallPtrSetImpl &VisitedBlocks, unsigned Depth) { SmallVector ExprsToRewrite; auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS, @@ -15428,14 +15506,13 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { } }; - BasicBlock *Header = L->getHeader(); SmallVector> Terms; // First, collect information from assumptions dominating the loop. for (auto &AssumeVH : SE.AC.assumptions()) { if (!AssumeVH) continue; auto *AssumeI = cast(AssumeVH); - if (!SE.DT.dominates(AssumeI, Header)) + if (!SE.DT.dominates(AssumeI, Block)) continue; Terms.emplace_back(AssumeI->getOperand(0), true); } @@ -15446,8 +15523,8 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { if (GuardDecl) for (const auto *GU : GuardDecl->users()) if (const auto *Guard = dyn_cast(GU)) - if (Guard->getFunction() == Header->getParent() && - SE.DT.dominates(Guard, Header)) + if (Guard->getFunction() == Block->getParent() && + SE.DT.dominates(Guard, Block)) Terms.emplace_back(Guard->getArgOperand(0), true); // Third, collect conditions from dominating branches. Starting at the loop @@ -15455,11 +15532,10 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { // predecessors that can be found that have unique successors leading to the // original header. // TODO: share this logic with isLoopEntryGuardedByCond. - for (std::pair Pair( - L->getLoopPredecessor(), Header); - Pair.first; + std::pair Pair(Pred, Block); + for (; Pair.first; Pair = SE.getPredecessorWithUniqueSuccessorForBB(Pair.first)) { - + VisitedBlocks.insert(Pair.second); const BranchInst *LoopEntryPredicate = dyn_cast(Pair.first->getTerminator()); if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional()) @@ -15467,6 +15543,22 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { Terms.emplace_back(LoopEntryPredicate->getCondition(), LoopEntryPredicate->getSuccessor(0) == Pair.second); + + // If we are recursively collecting guards stop after 2 + // predecessors to limit compile-time impact for now. + if (Depth > 0 && Terms.size() == 2) + break; + } + // Finally, if we stopped climbing the predecessor chain because + // there wasn't a unique one to continue, try to collect conditions + // for PHINodes by recursively following all of their incoming + // blocks and try to merge the found conditions to build a new one + // for the Phi. + if (Pair.second->hasNPredecessorsOrMore(2) && + Depth < MaxLoopGuardCollectionDepth) { + SmallDenseMap IncomingGuards; + for (auto &Phi : Pair.second->phis()) + collectFromPHI(SE, Guards, Phi, VisitedBlocks, IncomingGuards, Depth); } // Now apply the information from the collected conditions to @@ -15523,7 +15615,6 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { Guards.RewriteMap.insert({Expr, Guards.rewrite(RewriteTo)}); } } - return Guards; } const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index c63cf0c37f2f9..f8728ad50cf49 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -457,6 +457,12 @@ class LoopVectorizationPlanner { bool isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B) const; + /// Returns true if the per-lane cost of VectorizationFactor A is lower than + /// that of B in the context of vectorizing a loop with known \p MaxTripCount. + bool isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B, + const unsigned MaxTripCount) const; + /// Determines if we have the infrastructure to vectorize the loop and its /// epilogue, assuming the main loop is vectorized by \p VF. bool isCandidateForEpilogueVectorization(const ElementCount VF) const; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 68363abdb817a..df1e7d0fc7b0f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1554,7 +1554,10 @@ class LoopVectorizationCostModel { /// Returns true if epilogue vectorization is considered profitable, and /// false otherwise. /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; + /// \p Multiplier is an aditional scaling factor applied to VF before + /// comparing to EpilogueVectorizationMinVF. + bool isEpilogueVectorizationProfitable(const ElementCount VF, + const unsigned Multiplier) const; /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. @@ -4293,12 +4296,11 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { } bool LoopVectorizationPlanner::isMoreProfitable( - const VectorizationFactor &A, const VectorizationFactor &B) const { + const VectorizationFactor &A, const VectorizationFactor &B, + const unsigned MaxTripCount) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; - unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); - // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); @@ -4347,6 +4349,13 @@ bool LoopVectorizationPlanner::isMoreProfitable( return CmpFn(RTCostA, RTCostB); } +bool LoopVectorizationPlanner::isMoreProfitable( + const VectorizationFactor &A, const VectorizationFactor &B) const { + const unsigned MaxTripCount = + PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); + return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount); +} + static void emitInvalidCostRemarks(SmallVector InvalidCosts, OptimizationRemarkEmitter *ORE, Loop *TheLoop) { @@ -4626,7 +4635,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( } bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( - const ElementCount VF) const { + const ElementCount VF, const unsigned Multiplier) const { // FIXME: We need a much better cost-model to take different parameters such // as register pressure, code size increase and cost of extra branches into // account. For now we apply a very crude heuristic and only consider loops @@ -4641,9 +4650,6 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( if (TTI.getMaxInterleaveFactor(VF) <= 1) return false; - unsigned Multiplier = 1; - if (VF.isScalable()) - Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) return true; return false; @@ -4690,7 +4696,11 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( return Result; } - if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { + unsigned Multiplier = IC; + if (MainLoopVF.isScalable()) + Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1); + + if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -4709,16 +4719,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( ScalarEvolution &SE = *PSE.getSE(); Type *TCType = Legal->getWidestInductionType(); const SCEV *RemainingIterations = nullptr; + unsigned MaxTripCount = 0; for (auto &NextVF : ProfitableVFs) { // Skip candidate VFs without a corresponding VPlan. if (!hasPlanWithVF(NextVF.Width)) continue; - // Skip candidate VFs with widths >= the estimate runtime VF (scalable - // vectors) or the VF of the main loop (fixed vectors). + // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable + // vectors) or > the VF of the main loop (fixed vectors). if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) + (NextVF.Width.isScalable() && + ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) || + (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() && + ElementCount::isKnownGT(NextVF.Width, MainLoopVF))) continue; // If NextVF is greater than the number of remaining iterations, the @@ -4729,6 +4743,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); RemainingIterations = SE.getURemExpr( TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); + MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1; + if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, + SE.getConstant(TCType, MaxTripCount))) { + MaxTripCount = + SE.getUnsignedRangeMax(RemainingIterations).getZExtValue(); + } + LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " + << MaxTripCount << "\n"); } if (SE.isKnownPredicate( CmpInst::ICMP_UGT, @@ -4737,7 +4759,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( continue; } - if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) + if (Result.Width.isScalar() || + isMoreProfitable(NextVF, Result, MaxTripCount)) Result = NextVF; } diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll new file mode 100644 index 0000000000000..71d66ef04ade1 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll @@ -0,0 +1,279 @@ +; RUN: opt < %s -disable-output "-passes=print" -scalar-evolution-max-iterations=0 -scalar-evolution-classify-expressions=0 2>&1 | FileCheck %s + +define void @slt(i16 %a, i16 %b, i1 %c) { +; CHECK-LABEL: 'slt' +; CHECK-NEXT: Determining loop execution counts for: @slt +; CHECK-NEXT: Loop %loop: backedge-taken count is (19 + (-1 * %count)) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 18 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (19 + (-1 * %count)) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %b2 + +b1: + %cmp1 = icmp slt i16 %a, 1 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp slt i16 %b, 4 + br i1 %cmp2, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ] + %cmp3 = icmp sle i16 %count, 19 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, 1 + %exitcond = icmp eq i16 %iv.next, 20 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @ult(i16 %a, i16 %b, i1 %c) { +; CHECK-LABEL: 'ult' +; CHECK-NEXT: Determining loop execution counts for: @ult +; CHECK-NEXT: Loop %loop: backedge-taken count is (21 + (-1 * %count)) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 19 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (21 + (-1 * %count)) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %b2 + +b1: + %cmp1 = icmp ult i16 %a, 2 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp ult i16 %b, 5 + br i1 %cmp2, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ] + %cmp3 = icmp ule i16 %count, 20 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, 1 + %exitcond = icmp eq i16 %iv.next, 22 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @sgt(i16 %a, i16 %b, i1 %c) { +; CHECK-LABEL: 'sgt' +; CHECK-NEXT: Determining loop execution counts for: @sgt +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 9 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %b2 + +b1: + %cmp1 = icmp sgt i16 %a, 10 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp sgt i16 %b, 8 + br i1 %cmp2, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ] + %cmp3 = icmp sge i16 %count, 1 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, -1 + %exitcond = icmp eq i16 %iv.next, 0 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @ugt(i16 %a, i16 %b, i1 %c) { +; CHECK-LABEL: 'ugt' +; CHECK-NEXT: Determining loop execution counts for: @ugt +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 10 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %b2 + +b1: + %cmp1 = icmp ugt i16 %a, 11 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp ugt i16 %b, 7 + br i1 %cmp2, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ] + %cmp3 = icmp ne i16 %count, 0 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, -1 + %exitcond = icmp eq i16 %iv.next, 0 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @three_incoming(i16 %a, i16 %b, i1 %c, i1 %d) { +; CHECK-LABEL: 'three_incoming' +; CHECK-NEXT: Determining loop execution counts for: @three_incoming +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 11 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %entry2 + +entry2: + br i1 %d, label %b2, label %b3 + +b1: + %cmp1 = icmp ugt i16 %a, 10 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp ugt i16 %b, 8 + br i1 %cmp2, label %exit, label %preheader + +b3: + %cmp3 = icmp ugt i16 %b, 12 + br i1 %cmp3, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ], [ %b, %b3 ] + %cmp4 = icmp ne i16 %count, 0 + br i1 %cmp4, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, -1 + %exitcond = icmp eq i16 %iv.next, 0 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @mixed(i16 %a, i16 %b, i1 %c) { +; CHECK-LABEL: 'mixed' +; CHECK-NEXT: Determining loop execution counts for: @mixed +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 -2 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %b2 + +b1: + %cmp1 = icmp ugt i16 %a, 10 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp sgt i16 %b, 8 + br i1 %cmp2, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ] + %cmp3 = icmp ne i16 %count, 0 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, -1 + %exitcond = icmp eq i16 %iv.next, 0 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @one_constant(i16 %a, i16 %b, i1 %c, i16 %d) { +; CHECK-LABEL: 'one_constant' +; CHECK-NEXT: Determining loop execution counts for: @one_constant +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 -2 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %b2 + +b1: + %cmp1 = icmp ugt i16 %a, 10 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp ugt i16 %b, %d + br i1 %cmp2, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ] + %cmp3 = icmp ne i16 %count, 0 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, -1 + %exitcond = icmp eq i16 %iv.next, 0 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @epilogue(i64 %count) { +; CHECK-LABEL: 'epilogue' +; CHECK-NEXT: Determining loop execution counts for: @epilogue +; CHECK-NEXT: Loop %epilogue: backedge-taken count is (-1 + %count.epilogue) +; CHECK-NEXT: Loop %epilogue: constant max backedge-taken count is i64 6 +; CHECK-NEXT: Loop %epilogue: symbolic max backedge-taken count is (-1 + %count.epilogue) +; CHECK-NEXT: Loop %epilogue: Trip multiple is 1 +; CHECK-NEXT: Loop %while.body: backedge-taken count is ((-8 + %count) /u 8) +; CHECK-NEXT: Loop %while.body: constant max backedge-taken count is i64 2305843009213693951 +; CHECK-NEXT: Loop %while.body: symbolic max backedge-taken count is ((-8 + %count) /u 8) +; CHECK-NEXT: Loop %while.body: Trip multiple is 1 +entry: + %cmp = icmp ugt i64 %count, 7 + br i1 %cmp, label %while.body, label %epilogue.preheader + +while.body: + %iv = phi i64 [ %sub, %while.body ], [ %count, %entry ] + %sub = add i64 %iv, -8 + %exitcond.not = icmp ugt i64 %sub, 7 + br i1 %exitcond.not, label %while.body, label %while.loopexit + +while.loopexit: + %sub.exit = phi i64 [ %sub, %while.body ] + br label %epilogue.preheader + +epilogue.preheader: + %count.epilogue = phi i64 [ %count, %entry ], [ %sub.exit, %while.loopexit ] + %epilogue.cmp = icmp eq i64 %count.epilogue, 0 + br i1 %epilogue.cmp, label %exit, label %epilogue + +epilogue: + %iv.epilogue = phi i64 [ %dec, %epilogue ], [ %count.epilogue, %epilogue.preheader ] + %dec = add i64 %iv.epilogue, -1 + %exitcond.epilogue = icmp eq i64 %dec, 0 + br i1 %exitcond.epilogue, label %exit, label %epilogue + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll index ed8d8e15282d5..96edfc7663813 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll @@ -16,7 +16,7 @@ define void @test_pr25490(i32 %n, ptr noalias nocapture %a, ptr noalias nocaptur ; CHECK-NEXT: br i1 [[CMP_28]], label [[FOR_COND_CLEANUP:%.*]], label [[ITER_CHECK:%.*]] ; CHECK: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[N]], 16 @@ -50,38 +50,38 @@ define void @test_pr25490(i32 %n, ptr noalias nocapture %a, ptr noalias nocaptur ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 8 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 12 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC5:%.*]] = and i64 [[TMP0]], 4294967288 +; CHECK-NEXT: [[N_VEC5:%.*]] = and i64 [[TMP0]], 4294967292 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX7]] -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i8>, ptr [[TMP14]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX7]] -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x i8>, ptr [[TMP15]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = zext <8 x i8> [[WIDE_LOAD9]] to <8 x i16> -; CHECK-NEXT: [[TMP17:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i16> -; CHECK-NEXT: [[TMP18:%.*]] = mul nuw <8 x i16> [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = lshr <8 x i16> [[TMP18]], -; CHECK-NEXT: [[TMP20:%.*]] = trunc nuw <8 x i16> [[TMP19]] to <8 x i8> -; CHECK-NEXT: store <8 x i8> [[TMP20]], ptr [[TMP15]], align 1 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX7]] -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = zext <8 x i8> [[WIDE_LOAD10]] to <8 x i16> -; CHECK-NEXT: [[TMP23:%.*]] = mul nuw <8 x i16> [[TMP22]], [[TMP17]] -; CHECK-NEXT: [[TMP24:%.*]] = lshr <8 x i16> [[TMP23]], -; CHECK-NEXT: [[TMP25:%.*]] = trunc nuw <8 x i16> [[TMP24]] to <8 x i8> -; CHECK-NEXT: store <8 x i8> [[TMP25]], ptr [[TMP21]], align 1 -; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 8 -; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]] +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX6]] +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX6]] +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i8> [[WIDE_LOAD8]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = zext <4 x i8> [[WIDE_LOAD7]] to <4 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = mul nuw <4 x i16> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = lshr <4 x i16> [[TMP18]], +; CHECK-NEXT: [[TMP20:%.*]] = trunc nuw <4 x i16> [[TMP19]] to <4 x i8> +; CHECK-NEXT: store <4 x i8> [[TMP20]], ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX6]] +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = zext <4 x i8> [[WIDE_LOAD9]] to <4 x i16> +; CHECK-NEXT: [[TMP23:%.*]] = mul nuw <4 x i16> [[TMP22]], [[TMP17]] +; CHECK-NEXT: [[TMP24:%.*]] = lshr <4 x i16> [[TMP23]], +; CHECK-NEXT: [[TMP25:%.*]] = trunc nuw <4 x i16> [[TMP24]] to <4 x i8> +; CHECK-NEXT: store <4 x i8> [[TMP25]], ptr [[TMP21]], align 1 +; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX6]], 4 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N_VEC5]], [[TMP0]] -; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N_VEC5]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N11]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll new file mode 100644 index 0000000000000..428ec49fa51f6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll @@ -0,0 +1,395 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S < %s -passes=loop-vectorize -force-vector-interleave=4 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +define void @add_i8(ptr noalias nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %C, i64 noundef %Iterations) { +; CHECK-LABEL: @add_i8( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ITERATIONS:%.*]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[ITERATIONS]], 64 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ITERATIONS]], 64 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 48 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 16 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 32 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 48 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = add <16 x i8> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP21:%.*]] = add <16 x i8> [[WIDE_LOAD6]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP22:%.*]] = add <16 x i8> [[WIDE_LOAD7]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP23:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 16 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 32 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 48 +; CHECK-NEXT: store <16 x i8> [[TMP20]], ptr [[TMP28]], align 1 +; CHECK-NEXT: store <16 x i8> [[TMP21]], ptr [[TMP29]], align 1 +; CHECK-NEXT: store <16 x i8> [[TMP22]], ptr [[TMP30]], align 1 +; CHECK-NEXT: store <16 x i8> [[TMP23]], ptr [[TMP31]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[ITERATIONS]], 8 +; CHECK-NEXT: [[N_VEC10:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF9]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[INDEX11]], 0 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[TMP35]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP36]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i8>, ptr [[TMP37]], align 1 +; CHECK-NEXT: [[TMP38:%.*]] = add <8 x i8> [[WIDE_LOAD13]], [[WIDE_LOAD12]] +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP39]], i32 0 +; CHECK-NEXT: store <8 x i8> [[TMP38]], ptr [[TMP40]], align 1 +; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]] +; CHECK-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC10]] +; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP43]], [[TMP42]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX6]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[ITERATIONS]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %B, i64 %iv + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i8, ptr %C, i64 %iv + %1 = load i8, ptr %arrayidx2, align 1 + %add = add i8 %1, %0 + %arrayidx6 = getelementptr inbounds i8, ptr %A, i64 %iv + store i8 %add, ptr %arrayidx6, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %Iterations + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +define void @add_i16(ptr noalias nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %C, i64 noundef %Iterations) { +; CHECK-LABEL: @add_i16( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ITERATIONS:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[ITERATIONS]], 32 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ITERATIONS]], 32 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 16 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 24 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP12]], i32 8 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[TMP12]], i32 16 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP12]], i32 24 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i16>, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP19]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = add <8 x i16> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP21:%.*]] = add <8 x i16> [[WIDE_LOAD6]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP22:%.*]] = add <8 x i16> [[WIDE_LOAD7]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP23:%.*]] = add <8 x i16> [[WIDE_LOAD8]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i16, ptr [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i16, ptr [[TMP24]], i32 8 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i16, ptr [[TMP24]], i32 16 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i16, ptr [[TMP24]], i32 24 +; CHECK-NEXT: store <8 x i16> [[TMP20]], ptr [[TMP28]], align 1 +; CHECK-NEXT: store <8 x i16> [[TMP21]], ptr [[TMP29]], align 1 +; CHECK-NEXT: store <8 x i16> [[TMP22]], ptr [[TMP30]], align 1 +; CHECK-NEXT: store <8 x i16> [[TMP23]], ptr [[TMP31]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[ITERATIONS]], 4 +; CHECK-NEXT: [[N_VEC10:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF9]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[INDEX11]], 0 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i16, ptr [[TMP34]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i16>, ptr [[TMP35]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i16, ptr [[TMP36]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i16>, ptr [[TMP37]], align 1 +; CHECK-NEXT: [[TMP38:%.*]] = add <4 x i16> [[WIDE_LOAD13]], [[WIDE_LOAD12]] +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i16, ptr [[TMP39]], i32 0 +; CHECK-NEXT: store <4 x i16> [[TMP38]], ptr [[TMP40]], align 1 +; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 4 +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]] +; CHECK-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC10]] +; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP42:%.*]] = load i16, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: [[TMP43:%.*]] = load i16, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i16 [[TMP43]], [[TMP42]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX6]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[ITERATIONS]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i16, ptr %B, i64 %iv + %0 = load i16, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i16, ptr %C, i64 %iv + %1 = load i16, ptr %arrayidx2, align 1 + %add = add i16 %1, %0 + %arrayidx6 = getelementptr inbounds i16, ptr %A, i64 %iv + store i16 %add, ptr %arrayidx6, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %Iterations + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +define void @add_i32(ptr noalias nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %C, i64 noundef %Iterations) { +; CHECK-LABEL: @add_i32( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ITERATIONS:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[ITERATIONS]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ITERATIONS]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 8 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP19]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = add <4 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP21:%.*]] = add <4 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP22:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP23:%.*]] = add <4 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 4 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 8 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 12 +; CHECK-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP28]], align 1 +; CHECK-NEXT: store <4 x i32> [[TMP21]], ptr [[TMP29]], align 1 +; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP30]], align 1 +; CHECK-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP31]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[ITERATIONS]], 4 +; CHECK-NEXT: [[N_VEC10:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF9]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[INDEX11]], 0 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, ptr [[TMP35]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 1 +; CHECK-NEXT: [[TMP38:%.*]] = add <4 x i32> [[WIDE_LOAD13]], [[WIDE_LOAD12]] +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP40]], align 1 +; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 4 +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]] +; CHECK-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC10]] +; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP43]], [[TMP42]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX6]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[ITERATIONS]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %B, i64 %iv + %0 = load i32, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i32, ptr %C, i64 %iv + %1 = load i32, ptr %arrayidx2, align 1 + %add = add i32 %1, %0 + %arrayidx6 = getelementptr inbounds i32, ptr %A, i64 %iv + store i32 %add, ptr %arrayidx6, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %Iterations + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index 3fd957ccb84f7..9dbcb09a4bea8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -230,13 +230,15 @@ exit: define void @wide_truncated_iv(ptr %dst) { ; CHECK-LABEL: define void @wide_truncated_iv( ; CHECK-SAME: ptr [[DST:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ , [[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i8> [[VEC_IND]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 @@ -251,18 +253,41 @@ define void @wide_truncated_iv(ptr %dst) { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 192 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 192, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 192, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 192, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i8 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[TMP7]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i8> [[DOTSPLAT]], ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX3:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <8 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX3]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: store <8 x i8> [[VEC_IND4]], ptr [[TMP10]], align 1 +; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX3]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <8 x i8> [[VEC_IND4]], +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT7]], 200 +; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT]], label [[SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 200, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 192, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP1]] ] ; CHECK-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i8 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store i8 [[TRUNC_IV]], ptr [[GEP]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[IV]], 200 -; CHECK-NEXT: br i1 [[C]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -549,5 +574,5 @@ exit: ; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll index 72d528d8748ba..981f27bb2736c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll @@ -10,9 +10,12 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-4-LABEL: @interleave_integer_reduction( -; INTERLEAVE-4-NEXT: entry: -; INTERLEAVE-4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 -; INTERLEAVE-4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; INTERLEAVE-4-NEXT: iter.check: +; INTERLEAVE-4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; INTERLEAVE-4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; INTERLEAVE-4: vector.main.loop.iter.check: +; INTERLEAVE-4-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16 +; INTERLEAVE-4-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE-4: vector.ph: ; INTERLEAVE-4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 ; INTERLEAVE-4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] @@ -20,9 +23,9 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-4: vector.body: ; INTERLEAVE-4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; INTERLEAVE-4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; INTERLEAVE-4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 @@ -36,38 +39,64 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 ; INTERLEAVE-4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 ; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 1 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP11]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP11]], align 1 ; INTERLEAVE-4-NEXT: [[TMP12]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; INTERLEAVE-4-NEXT: [[TMP13]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD4]] -; INTERLEAVE-4-NEXT: [[TMP14]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD5]] -; INTERLEAVE-4-NEXT: [[TMP15]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD6]] +; INTERLEAVE-4-NEXT: [[TMP13]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD5]] +; INTERLEAVE-4-NEXT: [[TMP14]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD6]] +; INTERLEAVE-4-NEXT: [[TMP15]] = add <4 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]] ; INTERLEAVE-4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; INTERLEAVE-4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; INTERLEAVE-4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; INTERLEAVE-4: middle.block: ; INTERLEAVE-4-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP13]], [[TMP12]] -; INTERLEAVE-4-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP14]], [[BIN_RDX]] -; INTERLEAVE-4-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP15]], [[BIN_RDX7]] -; INTERLEAVE-4-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) +; INTERLEAVE-4-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP14]], [[BIN_RDX]] +; INTERLEAVE-4-NEXT: [[BIN_RDX9:%.*]] = add <4 x i32> [[TMP15]], [[BIN_RDX8]] +; INTERLEAVE-4-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX9]]) ; INTERLEAVE-4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; INTERLEAVE-4: scalar.ph: -; INTERLEAVE-4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; INTERLEAVE-4: vec.epilog.iter.check: +; INTERLEAVE-4-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] +; INTERLEAVE-4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; INTERLEAVE-4: vec.epilog.ph: +; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; INTERLEAVE-4-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; INTERLEAVE-4-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[N]], 4 +; INTERLEAVE-4-NEXT: [[N_VEC11:%.*]] = sub i64 [[N]], [[N_MOD_VF10]] +; INTERLEAVE-4-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 +; INTERLEAVE-4-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; INTERLEAVE-4: vec.epilog.vector.body: +; INTERLEAVE-4-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[VEC_PHI13:%.*]] = phi <4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[TMP19:%.*]] = add i64 [[INDEX12]], 0 +; INTERLEAVE-4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP19]] +; INTERLEAVE-4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP21]], align 1 +; INTERLEAVE-4-NEXT: [[TMP22]] = add <4 x i32> [[VEC_PHI13]], [[WIDE_LOAD14]] +; INTERLEAVE-4-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 4 +; INTERLEAVE-4-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC11]] +; INTERLEAVE-4-NEXT: br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; INTERLEAVE-4: vec.epilog.middle.block: +; INTERLEAVE-4-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP22]]) +; INTERLEAVE-4-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC11]] +; INTERLEAVE-4-NEXT: br i1 [[CMP_N16]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; INTERLEAVE-4: vec.epilog.scalar.ph: +; INTERLEAVE-4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i32 [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ] ; INTERLEAVE-4-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE-4: loop: -; INTERLEAVE-4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; INTERLEAVE-4-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; INTERLEAVE-4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; INTERLEAVE-4-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX17]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] ; INTERLEAVE-4-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] ; INTERLEAVE-4-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 ; INTERLEAVE-4-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[L]] ; INTERLEAVE-4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; INTERLEAVE-4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; INTERLEAVE-4-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; INTERLEAVE-4-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; INTERLEAVE-4: exit: -; INTERLEAVE-4-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-4-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; INTERLEAVE-4-NEXT: ret i32 [[RED_NEXT_LCSSA]] ; ; INTERLEAVE-2-LABEL: @interleave_integer_reduction( diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll index 0f26092f510ca..a0c271b8fc521 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll @@ -16,59 +16,93 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-LABEL: @saddsat( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT6]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] -; CHECK: while.body.preheader: +; CHECK-NEXT: br i1 [[CMP_NOT6]], label [[WHILE_END:%.*]], label [[ITER_CHECK:%.*]] +; CHECK: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[BLOCKSIZE]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[BLOCKSIZE]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967280 -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP2]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX3:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[OFFSET_IDX3]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 16 -; CHECK-NEXT: store <8 x i16> [[TMP6]], ptr [[NEXT_GEP6]], align 2 -; CHECK-NEXT: store <8 x i16> [[TMP7]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD6]], <8 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 16 +; CHECK-NEXT: store <8 x i16> [[TMP2]], ptr [[NEXT_GEP4]], align 2 +; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[TMP4]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP7]] +; CHECK-NEXT: [[DOTCAST9:%.*]] = trunc nuw i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST9]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 12 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_VEC8:%.*]] = and i64 [[TMP0]], 4294967292 +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC8]] to i32 +; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[N_VEC8]], 1 +; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[N_VEC8]], 1 +; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP9]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT23:%.*]] = insertelement <4 x i16> poison, i16 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT24:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT23]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX17:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX18:%.*]] = shl i64 [[INDEX17]], 1 +; CHECK-NEXT: [[NEXT_GEP19:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[OFFSET_IDX18]] +; CHECK-NEXT: [[OFFSET_IDX20:%.*]] = shl i64 [[INDEX17]], 1 +; CHECK-NEXT: [[NEXT_GEP21:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[OFFSET_IDX20]] +; CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i16>, ptr [[NEXT_GEP19]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[WIDE_LOAD22]], <4 x i16> [[BROADCAST_SPLAT24]]) +; CHECK-NEXT: store <4 x i16> [[TMP10]], ptr [[NEXT_GEP21]], align 2 +; CHECK-NEXT: [[INDEX_NEXT25]] = add nuw i64 [[INDEX17]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N26:%.*]] = icmp eq i64 [[N_VEC8]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N26]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END10]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi ptr [ [[IND_END14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL13]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 2 -; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP11:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP10]], i16 [[OFFSET]]) +; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP12]], i16 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, ptr [[PDST_ADDR_07]], i64 2 -; CHECK-NEXT: store i16 [[TMP11]], ptr [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i16 [[TMP13]], ptr [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: while.end: ; CHECK-NEXT: ret void ; @@ -131,50 +165,50 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END18:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] -; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] -; CHECK-NEXT: [[DOTCAST11:%.*]] = trunc nuw i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END12:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST11]] +; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] +; CHECK-NEXT: [[DOTCAST8:%.*]] = trunc nuw i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END9:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST8]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 24 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[TMP0]], 4294967288 -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC9]] to i32 -; CHECK-NEXT: [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC9]] -; CHECK-NEXT: [[IND_END17:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC9]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT25]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC7:%.*]] = and i64 [[TMP0]], 4294967288 +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC7]] to i32 +; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] +; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC7]] +; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC7]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT20]], <8 x i8> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX21:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP22:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX21]] -; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX21]] -; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i8>, ptr [[NEXT_GEP22]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD24]], <8 x i8> [[BROADCAST_SPLAT26]]) -; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[NEXT_GEP23]], align 2 -; CHECK-NEXT: [[INDEX_NEXT27]] = add nuw i64 [[INDEX21]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC9]] -; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[INDEX16:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX16]] +; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX16]] +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i8>, ptr [[NEXT_GEP17]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD19]], <8 x i8> [[BROADCAST_SPLAT21]]) +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[NEXT_GEP18]], align 2 +; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX16]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC7]] +; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[N_VEC9]], [[TMP0]] -; CHECK-NEXT: br i1 [[CMP_N20]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N23:%.*]] = icmp eq i64 [[N_VEC7]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N23]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i32 [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi ptr [ [[IND_END14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi ptr [ [[IND_END17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi ptr [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi ptr [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL13]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL19]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL15]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[PSRC_ADDR_08]], align 2 ; CHECK-NEXT: [[TMP9:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP8]], i8 [[OFFSET]]) @@ -182,7 +216,7 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: store i8 [[TMP9]], ptr [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: while.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll index 61bd8c51e1605..6dc66d4368b1b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -12,7 +12,7 @@ define void @add_a(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP8]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LEN]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 @@ -39,29 +39,29 @@ define void @add_a(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX5]], 0 +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = add <8 x i8> [[WIDE_LOAD6]], +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i8> [[WIDE_LOAD5]], ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0 -; CHECK-NEXT: store <8 x i8> [[TMP11]], ptr [[TMP13]], align 1 -; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX5]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] +; CHECK-NEXT: store <4 x i8> [[TMP11]], ptr [[TMP13]], align 1 +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[CMP_N4]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -115,7 +115,7 @@ define void @add_a1(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP8]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LEN]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 @@ -142,29 +142,29 @@ define void @add_a1(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX5]], 0 +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw <8 x i8> [[WIDE_LOAD6]], +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw <4 x i8> [[WIDE_LOAD5]], ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0 -; CHECK-NEXT: store <8 x i8> [[TMP11]], ptr [[TMP13]], align 1 -; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX5]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] +; CHECK-NEXT: store <4 x i8> [[TMP11]], ptr [[TMP13]], align 1 +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[CMP_N4]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -285,7 +285,7 @@ define void @add_c(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP8]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LEN]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 @@ -313,30 +313,30 @@ define void @add_c(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX5]], 0 +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[WIDE_LOAD6]] to <8 x i16> -; CHECK-NEXT: [[TMP13:%.*]] = add <8 x i16> [[TMP12]], +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[WIDE_LOAD5]] to <4 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i16> [[TMP12]], ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP14]], i32 0 -; CHECK-NEXT: store <8 x i16> [[TMP13]], ptr [[TMP15]], align 2 -; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX5]], 8 -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] +; CHECK-NEXT: store <4 x i16> [[TMP13]], ptr [[TMP15]], align 2 +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP16]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[CMP_N4]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -462,7 +462,7 @@ define void @add_e(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CONV11:%.*]] = zext i8 [[ARG2]] to i32 ; CHECK-NEXT: [[CONV13:%.*]] = zext i8 [[ARG1]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LEN]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 @@ -502,42 +502,42 @@ define void @add_e(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT7]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT8]] to <8 x i8> -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT9]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT10]] to <8 x i8> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[CONV13]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT7]] to <4 x i8> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[CONV11]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT9]] to <4 x i8> ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX11]], 0 +; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX10]], 0 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i8> [[WIDE_LOAD12]], -; CHECK-NEXT: [[TMP23:%.*]] = add <8 x i8> [[TMP22]], -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i8> [[WIDE_LOAD12]], -; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i8> [[TMP24]], -; CHECK-NEXT: [[TMP26:%.*]] = and <8 x i8> [[TMP23]], [[TMP17]] -; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i8> [[TMP25]], -; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i8> [[TMP27]], [[TMP18]] -; CHECK-NEXT: [[TMP29:%.*]] = mul <8 x i8> [[TMP28]], [[TMP26]] +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = shl <4 x i8> [[WIDE_LOAD11]], +; CHECK-NEXT: [[TMP23:%.*]] = add <4 x i8> [[TMP22]], +; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i8> [[WIDE_LOAD11]], +; CHECK-NEXT: [[TMP25:%.*]] = mul <4 x i8> [[TMP24]], +; CHECK-NEXT: [[TMP26:%.*]] = and <4 x i8> [[TMP23]], [[TMP17]] +; CHECK-NEXT: [[TMP27:%.*]] = and <4 x i8> [[TMP25]], +; CHECK-NEXT: [[TMP28:%.*]] = xor <4 x i8> [[TMP27]], [[TMP18]] +; CHECK-NEXT: [[TMP29:%.*]] = mul <4 x i8> [[TMP28]], [[TMP26]] ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP19]] ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0 -; CHECK-NEXT: store <8 x i8> [[TMP29]], ptr [[TMP31]], align 1 -; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX11]], 8 -; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC5]] +; CHECK-NEXT: store <4 x i8> [[TMP29]], ptr [[TMP31]], align 1 +; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], 4 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[TMP32]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N13:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[CMP_N13]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -610,7 +610,7 @@ define void @add_f(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CONV11:%.*]] = zext i8 [[ARG2]] to i32 ; CHECK-NEXT: [[CONV13:%.*]] = zext i8 [[ARG1]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LEN]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 @@ -652,44 +652,44 @@ define void @add_f(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT7]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT8]] to <8 x i8> -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT9]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT10]] to <8 x i8> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[CONV13]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT7]] to <4 x i8> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[CONV11]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT9]] to <4 x i8> ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX11]], 0 +; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX10]], 0 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i16>, ptr [[TMP23]], align 2 -; CHECK-NEXT: [[TMP24:%.*]] = trunc <8 x i16> [[WIDE_LOAD12]] to <8 x i8> -; CHECK-NEXT: [[TMP25:%.*]] = shl <8 x i8> [[TMP24]], -; CHECK-NEXT: [[TMP26:%.*]] = add <8 x i8> [[TMP25]], -; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i8> [[TMP24]], -; CHECK-NEXT: [[TMP28:%.*]] = or <8 x i8> [[TMP27]], -; CHECK-NEXT: [[TMP29:%.*]] = mul <8 x i8> [[TMP28]], -; CHECK-NEXT: [[TMP30:%.*]] = and <8 x i8> [[TMP26]], [[TMP19]] -; CHECK-NEXT: [[TMP31:%.*]] = and <8 x i8> [[TMP29]], -; CHECK-NEXT: [[TMP32:%.*]] = xor <8 x i8> [[TMP31]], [[TMP20]] -; CHECK-NEXT: [[TMP33:%.*]] = mul <8 x i8> [[TMP32]], [[TMP30]] +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i16>, ptr [[TMP23]], align 2 +; CHECK-NEXT: [[TMP24:%.*]] = trunc <4 x i16> [[WIDE_LOAD11]] to <4 x i8> +; CHECK-NEXT: [[TMP25:%.*]] = shl <4 x i8> [[TMP24]], +; CHECK-NEXT: [[TMP26:%.*]] = add <4 x i8> [[TMP25]], +; CHECK-NEXT: [[TMP27:%.*]] = and <4 x i8> [[TMP24]], +; CHECK-NEXT: [[TMP28:%.*]] = or <4 x i8> [[TMP27]], +; CHECK-NEXT: [[TMP29:%.*]] = mul <4 x i8> [[TMP28]], +; CHECK-NEXT: [[TMP30:%.*]] = and <4 x i8> [[TMP26]], [[TMP19]] +; CHECK-NEXT: [[TMP31:%.*]] = and <4 x i8> [[TMP29]], +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i8> [[TMP31]], [[TMP20]] +; CHECK-NEXT: [[TMP33:%.*]] = mul <4 x i8> [[TMP32]], [[TMP30]] ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i32 0 -; CHECK-NEXT: store <8 x i8> [[TMP33]], ptr [[TMP35]], align 1 -; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX11]], 8 -; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC5]] +; CHECK-NEXT: store <4 x i8> [[TMP33]], ptr [[TMP35]], align 1 +; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], 4 +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N13:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[CMP_N13]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -776,14 +776,14 @@ define void @add_phifail(ptr noalias nocapture readonly %p, ptr noalias nocaptur ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP4]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw <16 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i8> -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <16 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i32> [[TMP5]] to <16 x i8> +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP8]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP4]], i32 15 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -798,10 +798,10 @@ define void @add_phifail(ptr noalias nocapture readonly %p, ptr noalias nocaptur ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[CONV:%.*]], [[FOR_BODY]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[A_PHI:%.*]] = phi i32 [ [[CONV:%.*]], [[FOR_BODY]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV]] = zext i8 [[TMP11]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV]] = zext i8 [[TMP10]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2 ; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i8 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]] @@ -859,14 +859,14 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[TMP6]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw <16 x i32> [[TMP6]], -; CHECK-NEXT: [[TMP9:%.*]] = trunc <16 x i32> [[TMP8]] to <16 x i8> -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw <16 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i8> +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP10]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14 @@ -877,15 +877,15 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[A_PHI_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR:%.*]], [[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[A_PHI_LCSSA:%.*]] = phi i32 [ [[A_PHI:%.*]], [[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[A_PHI_LCSSA]] to i8 ; CHECK-NEXT: ret i8 [[RET]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SCALAR_RECUR]] = phi i32 [ [[CONV:%.*]], [[FOR_BODY]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[A_PHI]] = phi i32 [ [[CONV:%.*]], [[FOR_BODY]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV]] = zext i8 [[TMP13]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV]] = zext i8 [[TMP12]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2 ; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i8 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]] diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll index bae5c46d9386e..a74a2c3979e37 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll @@ -7,7 +7,7 @@ target triple = "powerpc64le-unknown-linux-gnu" define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-LABEL: define i1 @select_exit_cond( ; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ITER_CHECK:.*]]: ; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 ; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[N]] @@ -15,12 +15,14 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[UMAX]], [[START2]] ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP0]], i64 [[TMP1]]) ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP2]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -110,24 +112,61 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i64> [[TMP44]], [[TMP43]] -; CHECK-NEXT: [[BIN_RDX32:%.*]] = or <2 x i64> [[TMP45]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX33:%.*]] = or <2 x i64> [[TMP46]], [[BIN_RDX32]] -; CHECK-NEXT: [[BIN_RDX34:%.*]] = or <2 x i64> [[TMP47]], [[BIN_RDX33]] -; CHECK-NEXT: [[BIN_RDX35:%.*]] = or <2 x i64> [[TMP48]], [[BIN_RDX34]] -; CHECK-NEXT: [[BIN_RDX36:%.*]] = or <2 x i64> [[TMP49]], [[BIN_RDX35]] -; CHECK-NEXT: [[BIN_RDX37:%.*]] = or <2 x i64> [[TMP50]], [[BIN_RDX36]] +; CHECK-NEXT: [[BIN_RDX18:%.*]] = or <2 x i64> [[TMP45]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX19:%.*]] = or <2 x i64> [[TMP46]], [[BIN_RDX18]] +; CHECK-NEXT: [[BIN_RDX20:%.*]] = or <2 x i64> [[TMP47]], [[BIN_RDX19]] +; CHECK-NEXT: [[BIN_RDX21:%.*]] = or <2 x i64> [[TMP48]], [[BIN_RDX20]] +; CHECK-NEXT: [[BIN_RDX22:%.*]] = or <2 x i64> [[TMP49]], [[BIN_RDX21]] +; CHECK-NEXT: [[BIN_RDX37:%.*]] = or <2 x i64> [[TMP50]], [[BIN_RDX22]] ; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[BIN_RDX37]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP52]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[IND_END27:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_VEC25:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF24]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC25]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX29:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND30:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT31:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI32:%.*]] = phi <2 x i64> [ [[TMP55]], %[[VEC_EPILOG_PH]] ], [ [[TMP56:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX29]], 0 +; CHECK-NEXT: [[NEXT_GEP33:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP57]] +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[NEXT_GEP33]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <2 x i8>, ptr [[TMP58]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[WIDE_LOAD34]] to <2 x i64> +; CHECK-NEXT: [[TMP60:%.*]] = shl <2 x i64> [[VEC_IND30]], splat (i64 1) +; CHECK-NEXT: [[TMP61:%.*]] = shl <2 x i64> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP56]] = or <2 x i64> [[TMP61]], [[VEC_PHI32]] +; CHECK-NEXT: [[INDEX_NEXT35]] = add nuw i64 [[INDEX29]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT31]] = add <2 x i64> [[VEC_IND30]], splat (i64 2) +; CHECK-NEXT: [[TMP62:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC25]] +; CHECK-NEXT: br i1 [[TMP62]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP54:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP56]]) +; CHECK-NEXT: [[CMP_N36:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]] +; CHECK-NEXT: br i1 [[CMP_N36]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL28:%.*]] = phi ptr [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END27]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX37:%.*]] = phi i64 [ [[TMP54]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX37]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL28]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[TMP53:%.*]] = load i8, ptr [[PTR_IV]], align 1 ; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP53]] to i64 ; CHECK-NEXT: [[MUL:%.*]] = shl i64 [[IV]], 1 @@ -138,9 +177,9 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[CMP_I166_I:%.*]] = icmp ult ptr [[PTR_IV]], [[END]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i64 [[IV]], [[N]] ; CHECK-NEXT: [[AND:%.*]] = select i1 [[CMP_I166_I]], i1 [[CMP2]], i1 false -; CHECK-NEXT: br i1 [[AND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[AND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP52]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP52]], %[[MIDDLE_BLOCK]] ], [ [[TMP54]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RES:%.*]] = icmp eq i64 [[RED_NEXT_LCSSA]], 0 ; CHECK-NEXT: ret i1 [[RES]] ; @@ -171,5 +210,6 @@ exit: ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll index 786197bfdb90a..5ccbd199e8fe2 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll @@ -4,7 +4,7 @@ ; the future we need to replace this with a more meaningful test of the ; epilogue vectorization cost-model. ; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-minimum-VF=4 -S | FileCheck %s --check-prefix=CHECK-MIN-4 -; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S | FileCheck %s --check-prefix=CHECK-MIN-D +; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -force-vector-interleave=1 -S | FileCheck %s --check-prefix=CHECK-MIN-IC-1 target datalayout = "e-m:e-i64:64-n32:64" target triple = "powerpc64le-unknown-linux-gnu" @@ -85,14 +85,14 @@ for.end: ; preds = %for.end.loopexit, % ret void } -; Do not vectorize the epilogue for loops with VF less than the default -epilogue-vectorization-minimum-VF of 16. -; CHECK-MIN-D-LABEL: @f3 -; CHECK-MIN-D-NOT: vector.main.loop.iter.check -; CHECK-MIN-D-NOT: vec.epilog.iter.check -; CHECK-MIN-D-NOT: vec.epilog.ph -; CHECK-MIN-D-NOT: vec.epilog.vector.body -; CHECK-MIN-D-NOT: vec.epilog.middle.block -; CHECK-MIN-D: ret void +; Do not vectorize the epilogue for loops with VF*IC less than the default -epilogue-vectorization-minimum-VF of 16. +; CHECK-MIN-IC-1-LABEL: @f3 +; CHECK-MIN-IC-1-NOT: vector.main.loop.iter.check +; CHECK-MIN-IC-1-NOT: vec.epilog.iter.check +; CHECK-MIN-IC-1-NOT: vec.epilog.ph +; CHECK-MIN-IC-1-NOT: vec.epilog.vector.body +; CHECK-MIN-IC-1-NOT: vec.epilog.middle.block +; CHECK-MIN-IC-1: ret void ; Specify a smaller minimum VF (via `-epilogue-vectorization-minimum-VF=4`) and ; make sure the epilogue gets vectorized in that case. @@ -104,6 +104,15 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK-MIN-4: vec.epilog.middle.block ; CHECK-MIN-4: ret void +; Default behaviour is to vectorize the epilogue for this loop. +; CHECK-LABEL: @f3 +; CHECK: vector.main.loop.iter.check +; CHECK: vec.epilog.iter.check +; CHECK: vec.epilog.ph +; CHECK: vec.epilog.vector.body +; CHECK: vec.epilog.middle.block +; CHECK: ret void + define dso_local void @f3(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 signext %N) { entry: %cmp1 = icmp sgt i32 %N, 0 @@ -134,4 +143,4 @@ for.end: ; preds = %for.end.loopexit, % } attributes #0 = { minsize } -attributes #1 = { optsize } \ No newline at end of file +attributes #1 = { optsize } diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll index 77c41453f4863..0881a69c436e5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -11,7 +11,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK: iter.check: ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -3 ; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP3]], 32 @@ -39,34 +39,34 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END5:%.*]] = add i64 3, [[N_VEC]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 16 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 16 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]] ; CHECK-NEXT: [[IND_END4:%.*]] = add i64 3, [[N_VEC3]] ; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i8 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[TMP8]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i8> [[DOTSPLAT]], +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[DOTSPLAT]], ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND9:%.*]] = phi <16 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX11:%.*]] = add i64 3, [[INDEX8]] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX11]], 0 +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX10:%.*]] = add i64 3, [[INDEX7]] +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX10]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store <16 x i8> [[VEC_IND9]], ptr [[TMP11]], align 1 -; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX8]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT10]] = add <16 x i8> [[VEC_IND9]], -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC3]] +; CHECK-NEXT: store <4 x i8> [[VEC_IND8]], ptr [[TMP11]], align 1 +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i8> [[VEC_IND8]], +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[CMP_N7]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N12]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 82836015204dd..d37950fd57770 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -68,25 +68,27 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-LABEL: @PR27826( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: preheader: +; CHECK-NEXT: br i1 [[CMP]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] +; CHECK: iter.check: ; CHECK-NEXT: [[T0:%.*]] = sext i32 [[N]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[T0]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 5 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 32 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP119:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP120:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP121:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP122:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP120:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP121:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP122:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 32 @@ -201,9 +203,9 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-NEXT: [[TMP113:%.*]] = insertelement <4 x float> [[TMP112]], float [[TMP109]], i32 2 ; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x float> [[TMP113]], float [[TMP110]], i32 3 ; CHECK-NEXT: [[TMP115:%.*]] = fadd fast <4 x float> [[TMP42]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP116:%.*]] = fadd fast <4 x float> [[TMP50]], [[VEC_PHI1]] -; CHECK-NEXT: [[TMP117:%.*]] = fadd fast <4 x float> [[TMP58]], [[VEC_PHI2]] -; CHECK-NEXT: [[TMP118:%.*]] = fadd fast <4 x float> [[TMP66]], [[VEC_PHI3]] +; CHECK-NEXT: [[TMP116:%.*]] = fadd fast <4 x float> [[TMP50]], [[VEC_PHI2]] +; CHECK-NEXT: [[TMP117:%.*]] = fadd fast <4 x float> [[TMP58]], [[VEC_PHI3]] +; CHECK-NEXT: [[TMP118:%.*]] = fadd fast <4 x float> [[TMP66]], [[VEC_PHI4]] ; CHECK-NEXT: [[TMP119]] = fadd fast <4 x float> [[TMP115]], [[TMP90]] ; CHECK-NEXT: [[TMP120]] = fadd fast <4 x float> [[TMP116]], [[TMP98]] ; CHECK-NEXT: [[TMP121]] = fadd fast <4 x float> [[TMP117]], [[TMP106]] @@ -213,18 +215,72 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-NEXT: br i1 [[TMP123]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP120]], [[TMP119]] -; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[TMP121]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast <4 x float> [[TMP122]], [[BIN_RDX4]] -; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX5]]) +; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast <4 x float> [[TMP121]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <4 x float> [[TMP122]], [[BIN_RDX5]] +; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX6]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP124]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[IND_END9:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP124]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF7]] +; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC8]], 32 +; CHECK-NEXT: [[TMP125:%.*]] = insertelement <4 x float> zeroinitializer, float [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI11:%.*]] = phi <4 x float> [ [[TMP125]], [[VEC_EPILOG_PH]] ], [ [[TMP155:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX12:%.*]] = mul i64 [[INDEX10]], 32 +; CHECK-NEXT: [[TMP126:%.*]] = add i64 [[OFFSET_IDX12]], 0 +; CHECK-NEXT: [[TMP127:%.*]] = add i64 [[OFFSET_IDX12]], 32 +; CHECK-NEXT: [[TMP128:%.*]] = add i64 [[OFFSET_IDX12]], 64 +; CHECK-NEXT: [[TMP129:%.*]] = add i64 [[OFFSET_IDX12]], 96 +; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP126]] +; CHECK-NEXT: [[TMP131:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP127]] +; CHECK-NEXT: [[TMP132:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP128]] +; CHECK-NEXT: [[TMP133:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP129]] +; CHECK-NEXT: [[TMP134:%.*]] = load float, ptr [[TMP130]], align 4 +; CHECK-NEXT: [[TMP135:%.*]] = load float, ptr [[TMP131]], align 4 +; CHECK-NEXT: [[TMP136:%.*]] = load float, ptr [[TMP132]], align 4 +; CHECK-NEXT: [[TMP137:%.*]] = load float, ptr [[TMP133]], align 4 +; CHECK-NEXT: [[TMP138:%.*]] = insertelement <4 x float> poison, float [[TMP134]], i32 0 +; CHECK-NEXT: [[TMP139:%.*]] = insertelement <4 x float> [[TMP138]], float [[TMP135]], i32 1 +; CHECK-NEXT: [[TMP140:%.*]] = insertelement <4 x float> [[TMP139]], float [[TMP136]], i32 2 +; CHECK-NEXT: [[TMP141:%.*]] = insertelement <4 x float> [[TMP140]], float [[TMP137]], i32 3 +; CHECK-NEXT: [[TMP142:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP126]] +; CHECK-NEXT: [[TMP143:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP127]] +; CHECK-NEXT: [[TMP144:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP128]] +; CHECK-NEXT: [[TMP145:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP129]] +; CHECK-NEXT: [[TMP146:%.*]] = load float, ptr [[TMP142]], align 4 +; CHECK-NEXT: [[TMP147:%.*]] = load float, ptr [[TMP143]], align 4 +; CHECK-NEXT: [[TMP148:%.*]] = load float, ptr [[TMP144]], align 4 +; CHECK-NEXT: [[TMP149:%.*]] = load float, ptr [[TMP145]], align 4 +; CHECK-NEXT: [[TMP150:%.*]] = insertelement <4 x float> poison, float [[TMP146]], i32 0 +; CHECK-NEXT: [[TMP151:%.*]] = insertelement <4 x float> [[TMP150]], float [[TMP147]], i32 1 +; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x float> [[TMP151]], float [[TMP148]], i32 2 +; CHECK-NEXT: [[TMP153:%.*]] = insertelement <4 x float> [[TMP152]], float [[TMP149]], i32 3 +; CHECK-NEXT: [[TMP154:%.*]] = fadd fast <4 x float> [[TMP141]], [[VEC_PHI11]] +; CHECK-NEXT: [[TMP155]] = fadd fast <4 x float> [[TMP154]], [[TMP153]] +; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 +; CHECK-NEXT: [[TMP156:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[TMP156]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP157:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP155]]) +; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[CMP_N14]], label [[LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX15:%.*]] = phi float [ [[TMP157]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ITER_CHECK]] ], [ [[TMP124]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR]] ] -; CHECK-NEXT: [[S_02:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR]] ] +; CHECK-NEXT: [[S_02:%.*]] = phi float [ [[BC_MERGE_RDX15]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] @@ -233,9 +289,9 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-NEXT: [[ADD4]] = fadd fast float [[ADD]], [[T2]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 32 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[T0]] -; CHECK-NEXT: br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: loopexit: -; CHECK-NEXT: [[ADD4_LCSSA:%.*]] = phi float [ [[ADD4]], [[FOR]] ], [ [[TMP124]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD4_LCSSA:%.*]] = phi float [ [[ADD4]], [[FOR]] ], [ [[TMP124]], [[MIDDLE_BLOCK]] ], [ [[TMP157]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD4_LCSSA]], [[LOOPEXIT]] ] @@ -314,10 +370,10 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[SRC_1]], align 8, !alias.scope [[META4:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[SRC_1]], align 8, !alias.scope [[META5:![0-9]+]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP13]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT9]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META7:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META8:![0-9]+]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <2 x i64> poison, i64 [[TMP14]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT13]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT10]], zeroinitializer @@ -329,10 +385,10 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i1> [[TMP19]] to <2 x i8> ; CHECK-NEXT: [[TMP22:%.*]] = zext <2 x i1> [[TMP20]] to <2 x i8> ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i8> [[TMP22]], i32 1 -; CHECK-NEXT: store i8 [[TMP23]], ptr [[DST]], align 1, !alias.scope [[META9:![0-9]+]], !noalias [[META11:![0-9]+]] +; CHECK-NEXT: store i8 [[TMP23]], ptr [[DST]], align 1, !alias.scope [[META10:![0-9]+]], !noalias [[META12:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -355,7 +411,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; CHECK-NEXT: [[IV_1_NEXT_WIDE]] = zext i32 [[IV_1_NEXT]] to i64 ; CHECK-NEXT: [[EC_2:%.*]] = icmp ult i64 [[IV_1_NEXT_WIDE]], [[B]] -; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -436,7 +492,7 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 { ; CHECK-NEXT: [[TMP27]] = or <2 x i1> [[VEC_PHI3]], [[TMP25]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i1> [[TMP27]], [[TMP26]] ; CHECK-NEXT: [[TMP29:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[BIN_RDX]]) @@ -456,7 +512,7 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 { ; CHECK-NEXT: [[ANY_OF_NEXT]] = select i1 [[CMP13_NOT_NOT]], i1 [[ANY_OF]], i1 false ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 40 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[ANY_OF_NEXT_LCSSA:%.*]] = phi i1 [ [[ANY_OF_NEXT]], [[LOOP]] ] ; CHECK-NEXT: ret i1 [[ANY_OF_NEXT_LCSSA]] @@ -537,10 +593,13 @@ exit: define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK-LABEL: @cost_duplicate_recipe_for_sinking( -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 @@ -548,7 +607,7 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE36:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 8 @@ -566,17 +625,17 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP13]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP14]], i32 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP15]], align 8 -; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP16]], align 8 -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP18]], align 8 +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP16]], align 8 +; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP17]], align 8 +; CHECK-NEXT: [[WIDE_VEC4:%.*]] = load <16 x double>, ptr [[TMP18]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x double> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x double> [[WIDE_VEC4]], <16 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC5]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC6]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC5]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC6]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC7]], zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP19]], i32 0 ; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: @@ -586,147 +645,207 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1 -; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; CHECK: pred.store.if7: +; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; CHECK: pred.store.if8: ; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP28:%.*]] = shl nsw i64 [[TMP27]], 2 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP28]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP29]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] -; CHECK: pred.store.continue8: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] +; CHECK: pred.store.continue9: ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2 -; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; CHECK: pred.store.if9: +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] +; CHECK: pred.store.if10: ; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP32:%.*]] = shl nsw i64 [[TMP31]], 2 ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP32]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP33]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] -; CHECK: pred.store.continue10: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]] +; CHECK: pred.store.continue11: ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] -; CHECK: pred.store.if11: +; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] +; CHECK: pred.store.if12: ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP36:%.*]] = shl nsw i64 [[TMP35]], 2 ; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP36]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP37]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] -; CHECK: pred.store.continue12: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE13]] +; CHECK: pred.store.continue13: ; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0 -; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] -; CHECK: pred.store.if13: +; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] +; CHECK: pred.store.if14: ; CHECK-NEXT: [[TMP39:%.*]] = shl nsw i64 [[TMP4]], 2 ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP39]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP40]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] -; CHECK: pred.store.continue14: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]] +; CHECK: pred.store.continue15: ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1 -; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]] -; CHECK: pred.store.if15: +; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; CHECK: pred.store.if16: ; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[INDEX]], 5 ; CHECK-NEXT: [[TMP43:%.*]] = shl nsw i64 [[TMP42]], 2 ; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP43]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP44]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]] -; CHECK: pred.store.continue16: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] +; CHECK: pred.store.continue17: ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2 -; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] -; CHECK: pred.store.if17: +; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; CHECK: pred.store.if18: ; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP47:%.*]] = shl nsw i64 [[TMP46]], 2 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP47]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP48]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] -; CHECK: pred.store.continue18: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] +; CHECK: pred.store.continue19: ; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3 -; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] -; CHECK: pred.store.if19: +; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] +; CHECK: pred.store.if20: ; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP51:%.*]] = shl nsw i64 [[TMP50]], 2 ; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP51]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP52]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]] -; CHECK: pred.store.continue20: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]] +; CHECK: pred.store.continue21: ; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP21]], i32 0 -; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] -; CHECK: pred.store.if21: +; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; CHECK: pred.store.if22: ; CHECK-NEXT: [[TMP54:%.*]] = shl nsw i64 [[TMP5]], 2 ; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP54]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP55]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]] -; CHECK: pred.store.continue22: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE23]] +; CHECK: pred.store.continue23: ; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP21]], i32 1 -; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] -; CHECK: pred.store.if23: +; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; CHECK: pred.store.if24: ; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 9 ; CHECK-NEXT: [[TMP58:%.*]] = shl nsw i64 [[TMP57]], 2 ; CHECK-NEXT: [[TMP59:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP58]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP59]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]] -; CHECK: pred.store.continue24: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE25]] +; CHECK: pred.store.continue25: ; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP21]], i32 2 -; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] -; CHECK: pred.store.if25: +; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; CHECK: pred.store.if26: ; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 10 ; CHECK-NEXT: [[TMP62:%.*]] = shl nsw i64 [[TMP61]], 2 ; CHECK-NEXT: [[TMP63:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP62]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP63]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]] -; CHECK: pred.store.continue26: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE27]] +; CHECK: pred.store.continue27: ; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP21]], i32 3 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] -; CHECK: pred.store.if27: +; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] +; CHECK: pred.store.if28: ; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 11 ; CHECK-NEXT: [[TMP66:%.*]] = shl nsw i64 [[TMP65]], 2 ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP66]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP67]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]] -; CHECK: pred.store.continue28: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE29]] +; CHECK: pred.store.continue29: ; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0 -; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] -; CHECK: pred.store.if29: +; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] +; CHECK: pred.store.if30: ; CHECK-NEXT: [[TMP69:%.*]] = shl nsw i64 [[TMP6]], 2 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP69]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP70]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]] -; CHECK: pred.store.continue30: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE31]] +; CHECK: pred.store.continue31: ; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] -; CHECK: pred.store.if31: +; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] +; CHECK: pred.store.if32: ; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP73:%.*]] = shl nsw i64 [[TMP72]], 2 ; CHECK-NEXT: [[TMP74:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP73]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP74]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]] -; CHECK: pred.store.continue32: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE33]] +; CHECK: pred.store.continue33: ; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2 -; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] -; CHECK: pred.store.if33: +; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]] +; CHECK: pred.store.if34: ; CHECK-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP77:%.*]] = shl nsw i64 [[TMP76]], 2 ; CHECK-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP77]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP78]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE34]] -; CHECK: pred.store.continue34: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE35]] +; CHECK: pred.store.continue35: ; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36]] -; CHECK: pred.store.if35: +; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]] +; CHECK: pred.store.if36: ; CHECK-NEXT: [[TMP80:%.*]] = add i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP81:%.*]] = shl nsw i64 [[TMP80]], 2 ; CHECK-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP81]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP82]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE36]] -; CHECK: pred.store.continue36: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE37]] +; CHECK: pred.store.continue37: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF38:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[N_MOD_VF38]], 0 +; CHECK-NEXT: [[TMP85:%.*]] = select i1 [[TMP84]], i64 4, i64 [[N_MOD_VF38]] +; CHECK-NEXT: [[N_VEC39:%.*]] = sub i64 [[TMP0]], [[TMP85]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX40:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT51:%.*]], [[PRED_STORE_CONTINUE50:%.*]] ] +; CHECK-NEXT: [[TMP86:%.*]] = add i64 [[INDEX40]], 0 +; CHECK-NEXT: [[TMP87:%.*]] = shl nsw i64 [[TMP86]], 2 +; CHECK-NEXT: [[TMP88:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP87]] +; CHECK-NEXT: [[TMP89:%.*]] = getelementptr double, ptr [[TMP88]], i32 0 +; CHECK-NEXT: [[WIDE_VEC41:%.*]] = load <16 x double>, ptr [[TMP89]], align 8 +; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC41]], <16 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP90:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC42]], zeroinitializer +; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i1> [[TMP90]], i32 0 +; CHECK-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] +; CHECK: pred.store.if43: +; CHECK-NEXT: [[TMP92:%.*]] = shl nsw i64 [[TMP86]], 2 +; CHECK-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP92]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP93]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE44]] +; CHECK: pred.store.continue44: +; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP90]], i32 1 +; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] +; CHECK: pred.store.if45: +; CHECK-NEXT: [[TMP95:%.*]] = add i64 [[INDEX40]], 1 +; CHECK-NEXT: [[TMP96:%.*]] = shl nsw i64 [[TMP95]], 2 +; CHECK-NEXT: [[TMP97:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP96]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP97]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE46]] +; CHECK: pred.store.continue46: +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i1> [[TMP90]], i32 2 +; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] +; CHECK: pred.store.if47: +; CHECK-NEXT: [[TMP99:%.*]] = add i64 [[INDEX40]], 2 +; CHECK-NEXT: [[TMP100:%.*]] = shl nsw i64 [[TMP99]], 2 +; CHECK-NEXT: [[TMP101:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP100]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP101]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE48]] +; CHECK: pred.store.continue48: +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i1> [[TMP90]], i32 3 +; CHECK-NEXT: br i1 [[TMP102]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50]] +; CHECK: pred.store.if49: +; CHECK-NEXT: [[TMP103:%.*]] = add i64 [[INDEX40]], 3 +; CHECK-NEXT: [[TMP104:%.*]] = shl nsw i64 [[TMP103]], 2 +; CHECK-NEXT: [[TMP105:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP104]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP105]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE50]] +; CHECK: pred.store.continue50: +; CHECK-NEXT: [[INDEX_NEXT51]] = add nuw i64 [[INDEX40]], 4 +; CHECK-NEXT: [[TMP106:%.*]] = icmp eq i64 [[INDEX_NEXT51]], [[N_VEC39]] +; CHECK-NEXT: br i1 [[TMP106]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[IV_SHL:%.*]] = shl nsw i64 [[IV]], 2 ; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[IV_SHL]] ; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_0]], align 8 @@ -739,7 +858,7 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -815,7 +934,7 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP18]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <2 x i64> [[TMP5]], [[BIN_RDX]] @@ -836,7 +955,7 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: tail call void @llvm.assume(i1 [[C]]) ; CHECK-NEXT: [[GEP:%.*]] = getelementptr nusw [9 x i8], ptr null, i64 [[IV_NEXT]] ; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[GEP]], [[END]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP22]], [[LOOP]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[DOTLCSSA]] @@ -859,6 +978,270 @@ exit: ret i64 %1 } +; Test case for https://github.com/llvm/llvm-project/issues/96294 with a stored +; reduction which overwrites an earlier store. +define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { +; CHECK-LABEL: @reduction_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i1> poison, i1 [[X:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT]], <8 x i1> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i1> [[BROADCAST_SPLAT]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = lshr <8 x i64> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = trunc <8 x i64> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6]] = and <8 x i32> [[VEC_PHI]], [[TMP5]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 24 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP6]]) +; CHECK-NEXT: store i32 [[TMP8]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[L_AND:%.*]] = and i32 [[L]], 3 +; CHECK-NEXT: store i32 [[L_AND]], ptr [[DST]], align 4 +; CHECK-NEXT: [[X_EXT:%.*]] = zext i1 [[X]] to i64 +; CHECK-NEXT: [[LSHR:%.*]] = lshr i64 [[X_EXT]], 12 +; CHECK-NEXT: [[T:%.*]] = trunc i64 [[LSHR]] to i32 +; CHECK-NEXT: [[RED_NEXT]] = and i32 [[RED]], [[T]] +; CHECK-NEXT: store i32 [[RED_NEXT]], ptr [[DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 29 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %l = load i32, ptr %gep.src + %l.and = and i32 %l, 3 + store i32 %l.and, ptr %dst, align 4 + %x.ext = zext i1 %x to i64 + %lshr = lshr i64 %x.ext, 12 + %t = trunc i64 %lshr to i32 + %red.next = and i32 %red, %t + store i32 %red.next, ptr %dst, align 4 + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv, 29 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test case for https://github.com/llvm/llvm-project/issues/105722. +define i64 @live_in_known_1_via_scev() { +; CHECK-LABEL: @live_in_known_1_via_scev( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SEL:%.*]] = select i1 false, i32 3, i32 0 +; CHECK-NEXT: br label [[PH:%.*]] +; CHECK: ph: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[N:%.*]] = add nuw nsw i32 [[SEL]], 6 +; CHECK-NEXT: [[P_EXT:%.*]] = zext nneg i32 [[P]] to i64 +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[P_EXT]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i32> [[VEC_IV]], +; CHECK-NEXT: [[TMP1]] = mul <4 x i64> [[VEC_PHI]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP1]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8 +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[TMP2]]) +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[PH]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 3, [[PH]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_MUL:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED_MUL]] = mul nsw i64 [[RED]], [[P_EXT]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[RED_MUL]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %sel = select i1 false, i32 3, i32 0 + br label %ph + +ph: + %p = phi i32 [ 1, %entry ] + %N = add nuw nsw i32 %sel, 6 + %p.ext = zext nneg i32 %p to i64 + br label %loop + +loop: + %iv = phi i32 [ 0, %ph ], [ %iv.next, %loop ] + %red = phi i64 [ 3, %ph ], [ %red.mul, %loop ] + %red.mul = mul nsw i64 %red, %p.ext + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %N + br i1 %ec, label %exit, label %loop + +exit: + %res = phi i64 [ %red.mul, %loop ] + ret i64 %res +} + +; Test case for https://github.com/llvm/llvm-project/issues/107501. +define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) { +; CHECK-LABEL: @cost_loop_invariant_recipes( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[X:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i1> [[TMP0]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[BROADCAST_SPLAT2]], [[TMP1]] +; CHECK-NEXT: [[TMP3]] = mul <2 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[TMP3]]) +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT_I_I_I:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_MUL:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[NOT_X:%.*]] = xor i1 [[X]], true +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[NOT_X]] to i64 +; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[Y]], [[EXT]] +; CHECK-NEXT: [[RED_MUL]] = mul i64 [[SHL]], [[RED]] +; CHECK-NEXT: [[IV_NEXT_I_I_I]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[RED_MUL_LCSSA:%.*]] = phi i64 [ [[RED_MUL]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[RED_MUL_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next.i.i.i, %loop ] + %red = phi i64 [ 1, %entry ], [ %red.mul, %loop ] + %not.x = xor i1 %x, true + %ext = zext i1 %not.x to i64 + %shl = shl i64 %y, %ext + %red.mul = mul i64 %shl, %red + %iv.next.i.i.i = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 1 + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %red.mul +} + +; Test case for https://github.com/llvm/llvm-project/issues/113526. +define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 { +; CHECK-LABEL: @narrowed_reduction( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP:%.*]] to i32 +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = and <16 x i32> [[VEC_PHI]], +; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i32> [[VEC_PHI1]], +; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP0]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP1]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP6]] = zext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP7]] = zext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP9:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i1> +; CHECK-NEXT: [[BIN_RDX:%.*]] = or <16 x i1> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP12:%.*]] = zext i1 [[TMP11]] to i32 +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[OR13:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[OR13]], 1 +; CHECK-NEXT: [[OR]] = or i32 [[AND]], [[CONV]] +; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[LOOP]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[OR_LCSSA]] +; +entry: + %conv = zext i1 %cmp to i32 + br label %loop + +loop: + %iv = phi i32 [ 1, %entry ], [ %inc, %loop ] + %or13 = phi i32 [ 0, %entry ], [ %or, %loop ] + %and = and i32 %or13, 1 + %or = or i32 %and, %conv + %inc = add i32 %iv, 1 + %ec = icmp eq i32 %iv, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %or +} + declare void @llvm.assume(i1 noundef) #0 attributes #0 = { "target-cpu"="penryn" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll index c1be67853bf7c..01b2e835ab6f1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll @@ -11,7 +11,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX1]], [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[IV_START]] to i32 @@ -57,40 +57,40 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IV_START]], [[N_VEC]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[TMP3]], 8 +; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[TMP3]], 4 ; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF3]] ; CHECK-NEXT: [[IND_END5:%.*]] = add i64 [[IV_START]], [[N_VEC4]] ; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i32 -; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <8 x i32> poison, i32 [[TMP20]], i64 0 -; CHECK-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT10]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION12:%.*]] = add <8 x i32> [[DOTSPLAT11]], +; CHECK-NEXT: [[DOTSPLATINSERT9:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i64 0 +; CHECK-NEXT: [[DOTSPLAT10:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT9]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION11:%.*]] = add <4 x i32> [[DOTSPLAT10]], ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND13:%.*]] = phi <8 x i32> [ [[INDUCTION12]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX15:%.*]] = add i64 [[IV_START]], [[INDEX9]] -; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[OFFSET_IDX15]] to i32 +; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND12:%.*]] = phi <4 x i32> [ [[INDUCTION11]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX14:%.*]] = add i64 [[IV_START]], [[INDEX8]] +; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[OFFSET_IDX14]] to i32 ; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 0 ; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], -1 -; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i32> [[VEC_IND13]], -; CHECK-NEXT: [[TMP25:%.*]] = lshr exact <8 x i32> [[TMP24]], -; CHECK-NEXT: [[TMP26:%.*]] = trunc <8 x i32> [[TMP25]] to <8 x i16> +; CHECK-NEXT: [[TMP24:%.*]] = mul <4 x i32> [[VEC_IND12]], +; CHECK-NEXT: [[TMP25:%.*]] = lshr exact <4 x i32> [[TMP24]], +; CHECK-NEXT: [[TMP26:%.*]] = trunc <4 x i32> [[TMP25]] to <4 x i16> ; CHECK-NEXT: [[TMP27:%.*]] = zext i32 [[TMP23]] to i64 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[ARR]], i64 [[TMP27]] ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[TMP28]], i32 0 -; CHECK-NEXT: store <8 x i16> [[TMP26]], ptr [[TMP29]], align 2 -; CHECK-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX9]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT14]] = add <8 x i32> [[VEC_IND13]], -; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC4]] +; CHECK-NEXT: store <4 x i16> [[TMP26]], ptr [[TMP29]], align 2 +; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX8]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT13]] = add <4 x i32> [[VEC_IND12]], +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]] -; CHECK-NEXT: br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[CMP_N16]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_SCEVCHECK]] ], [ [[IV_START]], [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -201,33 +201,33 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[L]], [[N_MOD_VF7]] ; CHECK-NEXT: [[DOTCAST10:%.*]] = trunc i64 [[N_VEC8]] to i16 ; CHECK-NEXT: [[IND_END11:%.*]] = mul i16 [[DOTCAST10]], [[TMP0]] -; CHECK-NEXT: [[DOTSPLATINSERT17:%.*]] = insertelement <8 x i16> poison, i16 [[BC_RESUME_VAL]], i64 0 -; CHECK-NEXT: [[DOTSPLAT18:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT17]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLATINSERT19:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 -; CHECK-NEXT: [[DOTSPLAT20:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT19]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i16> , [[DOTSPLAT20]] -; CHECK-NEXT: [[INDUCTION21:%.*]] = add <8 x i16> [[DOTSPLAT18]], [[TMP20]] +; CHECK-NEXT: [[DOTSPLATINSERT16:%.*]] = insertelement <8 x i16> poison, i16 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT17:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT16]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT18:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLAT19:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT18]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i16> , [[DOTSPLAT19]] +; CHECK-NEXT: [[INDUCTION20:%.*]] = add <8 x i16> [[DOTSPLAT17]], [[TMP20]] ; CHECK-NEXT: [[TMP21:%.*]] = mul i16 [[TMP0]], 8 -; CHECK-NEXT: [[DOTSPLATINSERT22:%.*]] = insertelement <8 x i16> poison, i16 [[TMP21]], i64 0 -; CHECK-NEXT: [[DOTSPLAT23:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT22]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <8 x i16> poison, i16 [[OFF]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT27]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT21:%.*]] = insertelement <8 x i16> poison, i16 [[TMP21]], i64 0 +; CHECK-NEXT: [[DOTSPLAT22:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT21]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <8 x i16> poison, i16 [[OFF]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT27:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT26]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX16:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT29:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND24:%.*]] = phi <8 x i16> [ [[INDUCTION21]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX16]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = sub <8 x i16> [[VEC_IND24]], [[BROADCAST_SPLAT28]] +; CHECK-NEXT: [[INDEX15:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND23:%.*]] = phi <8 x i16> [ [[INDUCTION20]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX15]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = sub <8 x i16> [[VEC_IND23]], [[BROADCAST_SPLAT27]] ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP24]], i32 0 ; CHECK-NEXT: store <8 x i16> [[TMP23]], ptr [[TMP25]], align 2 -; CHECK-NEXT: [[INDEX_NEXT29]] = add nuw i64 [[INDEX16]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT26]] = add <8 x i16> [[VEC_IND24]], [[DOTSPLAT23]] -; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC8]] +; CHECK-NEXT: [[INDEX_NEXT28]] = add nuw i64 [[INDEX15]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT25]] = add <8 x i16> [[VEC_IND23]], [[DOTSPLAT22]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC8]] ; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[L]], [[N_VEC8]] -; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N29:%.*]] = icmp eq i64 [[L]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[CMP_N29]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i64 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i16 [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index cb4f5f6d9eaba..182e3ea655a66 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -18,11 +18,14 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 { ; AUTO_VEC-LABEL: @fp_iv_loop1( ; AUTO_VEC-NEXT: entry: ; AUTO_VEC-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; AUTO_VEC-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; AUTO_VEC: for.body.preheader: +; AUTO_VEC-NEXT: br i1 [[CMP4]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] +; AUTO_VEC: iter.check: ; AUTO_VEC-NEXT: [[ZEXT:%.*]] = zext nneg i32 [[N]] to i64 -; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 32 -; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] +; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AUTO_VEC: vector.main.loop.iter.check: +; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[N]], 32 +; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[ZEXT]], 2147483616 ; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float @@ -49,16 +52,46 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 { ; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[ZEXT]] -; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY]] +; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AUTO_VEC: vec.epilog.iter.check: +; AUTO_VEC-NEXT: [[DOTCAST10:%.*]] = uitofp nneg i64 [[N_VEC]] to float +; AUTO_VEC-NEXT: [[TMP6:%.*]] = fmul fast float [[DOTCAST10]], 5.000000e-01 +; AUTO_VEC-NEXT: [[IND_END11:%.*]] = fadd fast float [[TMP6]], 1.000000e+00 +; AUTO_VEC-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[ZEXT]], 28 +; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; AUTO_VEC-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]] +; AUTO_VEC: vec.epilog.ph: +; AUTO_VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AUTO_VEC-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AUTO_VEC-NEXT: [[N_VEC6:%.*]] = and i64 [[ZEXT]], 2147483644 +; AUTO_VEC-NEXT: [[DOTCAST8:%.*]] = uitofp nneg i64 [[N_VEC6]] to float +; AUTO_VEC-NEXT: [[TMP7:%.*]] = fmul fast float [[DOTCAST8]], 5.000000e-01 +; AUTO_VEC-NEXT: [[IND_END9:%.*]] = fadd fast float [[TMP7]], 1.000000e+00 +; AUTO_VEC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[BC_RESUME_VAL]], i64 0 +; AUTO_VEC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; AUTO_VEC-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], +; AUTO_VEC-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AUTO_VEC: vec.epilog.vector.body: +; AUTO_VEC-NEXT: [[INDEX13:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND14:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX13]] +; AUTO_VEC-NEXT: store <4 x float> [[VEC_IND14]], ptr [[TMP8]], align 4 +; AUTO_VEC-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX13]], 4 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT16]] = fadd fast <4 x float> [[VEC_IND14]], +; AUTO_VEC-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC6]] +; AUTO_VEC-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AUTO_VEC: vec.epilog.middle.block: +; AUTO_VEC-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[N_VEC6]], [[ZEXT]] +; AUTO_VEC-NEXT: br i1 [[CMP_N18]], label [[FOR_END]], label [[FOR_BODY]] ; AUTO_VEC: for.body: -; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; AUTO_VEC-NEXT: [[X_06:%.*]] = phi float [ [[CONV1:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[X_06:%.*]] = phi float [ [[CONV1:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] ; AUTO_VEC-NEXT: store float [[X_06]], ptr [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; AUTO_VEC-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] -; AUTO_VEC-NEXT: br i1 [[TMP6]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AUTO_VEC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; AUTO_VEC-NEXT: br i1 [[TMP10]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: ret void ; @@ -164,7 +197,7 @@ define void @fp_iv_loop2(ptr noalias nocapture %A, i32 %N) { ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP4:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP5:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: ret void ; @@ -221,12 +254,12 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; AUTO_VEC-NEXT: [[CMO:%.*]] = add nsw i64 [[N_VEC]], -1 -; AUTO_VEC-NEXT: [[DOTCAST6:%.*]] = sitofp i64 [[CMO]] to double -; AUTO_VEC-NEXT: [[TMP6:%.*]] = fmul fast double [[DOTCAST6]], 3.000000e+00 +; AUTO_VEC-NEXT: [[DOTCAST5:%.*]] = sitofp i64 [[CMO]] to double +; AUTO_VEC-NEXT: [[TMP6:%.*]] = fmul fast double [[DOTCAST5]], 3.000000e+00 ; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AUTO_VEC: for.body: ; AUTO_VEC-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] @@ -236,7 +269,7 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; AUTO_VEC-NEXT: [[J_NEXT]] = fadd fast double [[J]], 3.000000e+00 ; AUTO_VEC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[SMAX]] -; AUTO_VEC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ] ; AUTO_VEC-NEXT: ret double [[J_LCSSA]] @@ -324,7 +357,7 @@ define double @external_use_without_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: [[J_NEXT_EPIL]] = fadd double [[J_EPIL]], 3.000000e+00 ; AUTO_VEC-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP8:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP9:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[J_LCSSA_PH]], [[FOR_END_UNR_LCSSA]] ], [ [[J_EPIL]], [[FOR_BODY_EPIL]] ] ; AUTO_VEC-NEXT: ret double [[J_LCSSA]] @@ -359,11 +392,14 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) { ; AUTO_VEC-LABEL: @fadd_reassoc_FMF( ; AUTO_VEC-NEXT: entry: ; AUTO_VEC-NEXT: [[CMP_NOT11:%.*]] = icmp eq i32 [[N:%.*]], 0 -; AUTO_VEC-NEXT: br i1 [[CMP_NOT11]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AUTO_VEC: for.body.preheader: +; AUTO_VEC-NEXT: br i1 [[CMP_NOT11]], label [[FOR_COND_CLEANUP:%.*]], label [[ITER_CHECK:%.*]] +; AUTO_VEC: iter.check: ; AUTO_VEC-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 32 -; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] +; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AUTO_VEC: vector.main.loop.iter.check: +; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[N]], 32 +; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967264 ; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float @@ -395,23 +431,55 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) { ; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd reassoc <8 x float> [[STEP_ADD3]], ; AUTO_VEC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AUTO_VEC-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] -; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AUTO_VEC: vec.epilog.iter.check: +; AUTO_VEC-NEXT: [[DOTCAST13:%.*]] = uitofp nneg i64 [[N_VEC]] to float +; AUTO_VEC-NEXT: [[TMP11:%.*]] = fmul reassoc float [[DOTCAST13]], 4.200000e+01 +; AUTO_VEC-NEXT: [[IND_END14:%.*]] = fadd reassoc float [[TMP11]], 1.000000e+00 +; AUTO_VEC-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 28 +; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; AUTO_VEC-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]] +; AUTO_VEC: vec.epilog.ph: +; AUTO_VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AUTO_VEC-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AUTO_VEC-NEXT: [[N_VEC9:%.*]] = and i64 [[TMP0]], 4294967292 +; AUTO_VEC-NEXT: [[DOTCAST11:%.*]] = uitofp nneg i64 [[N_VEC9]] to float +; AUTO_VEC-NEXT: [[TMP12:%.*]] = fmul reassoc float [[DOTCAST11]], 4.200000e+01 +; AUTO_VEC-NEXT: [[IND_END12:%.*]] = fadd reassoc float [[TMP12]], 1.000000e+00 +; AUTO_VEC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[BC_RESUME_VAL]], i64 0 +; AUTO_VEC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; AUTO_VEC-NEXT: [[INDUCTION:%.*]] = fadd reassoc <4 x float> [[DOTSPLAT]], +; AUTO_VEC-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AUTO_VEC: vec.epilog.vector.body: +; AUTO_VEC-NEXT: [[INDEX16:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND17:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[P]], i64 [[INDEX16]] +; AUTO_VEC-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, ptr [[TMP13]], align 4 +; AUTO_VEC-NEXT: [[TMP14:%.*]] = fadd reassoc <4 x float> [[VEC_IND17]], [[WIDE_LOAD20]] +; AUTO_VEC-NEXT: store <4 x float> [[TMP14]], ptr [[TMP13]], align 4 +; AUTO_VEC-NEXT: [[INDEX_NEXT21]] = add nuw i64 [[INDEX16]], 4 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT19]] = fadd reassoc <4 x float> [[VEC_IND17]], +; AUTO_VEC-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT21]], [[N_VEC9]] +; AUTO_VEC-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; AUTO_VEC: vec.epilog.middle.block: +; AUTO_VEC-NEXT: [[CMP_N22:%.*]] = icmp eq i64 [[N_VEC9]], [[TMP0]] +; AUTO_VEC-NEXT: br i1 [[CMP_N22]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; AUTO_VEC: for.cond.cleanup: ; AUTO_VEC-NEXT: ret void ; AUTO_VEC: for.body: -; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P]], i64 [[INDVARS_IV]] -; AUTO_VEC-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP11]] +; AUTO_VEC-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP16]] ; AUTO_VEC-NEXT: store float [[ADD]], ptr [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[ADD3]] = fadd reassoc float [[X_012]], 4.200000e+01 ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AUTO_VEC-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]] -; AUTO_VEC-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; entry: %cmp.not11 = icmp eq i32 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index 6fbe8f61cc76e..684e6d40231d8 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -6,8 +6,11 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80: define i32 @iv_used_widened_and_truncated(ptr %dst, i64 %N) #0 { ; CHECK-LABEL: define i32 @iv_used_widened_and_truncated( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -36,21 +39,53 @@ define i32 @iv_used_widened_and_truncated(ptr %dst, i64 %N) #0 { ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD2]], ; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <8 x i32> [[STEP_ADD7]], ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC9:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF8]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[BC_RESUME_VAL7]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT14:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i64 0 +; CHECK-NEXT: [[DOTSPLAT15:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT14]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION16:%.*]] = add <4 x i32> [[DOTSPLAT15]], ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT19:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VEC_IND12:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT13:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VEC_IND17:%.*]] = phi <4 x i32> [ [[INDUCTION16]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT18:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr { i32, [8 x i32] }, ptr [[DST]], <4 x i64> [[VEC_IND12]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VEC_IND17]], <4 x ptr> [[TMP7]], i32 8, <4 x i1> ) +; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX11]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT13]] = add <4 x i64> [[VEC_IND12]], +; CHECK-NEXT: [[VEC_IND_NEXT18]] = add <4 x i32> [[VEC_IND17]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[CMP_N20]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP1]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr { i32, [8 x i32] }, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: [[T:%.*]] = trunc i64 [[IV]] to i32 ; CHECK-NEXT: store i32 [[T]], ptr [[GEP]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[C]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret i32 0 ; @@ -98,19 +133,19 @@ define void @multiple_truncated_ivs_with_wide_uses(i1 %c, ptr %A, ptr %B) { ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[TMP4]], i32 4 -; CHECK-NEXT: store <4 x i16> [[TMP2]], ptr [[TMP6]], align 2, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] -; CHECK-NEXT: store <4 x i16> [[TMP3]], ptr [[TMP7]], align 2, !alias.scope [[META4]], !noalias [[META7]] +; CHECK-NEXT: store <4 x i16> [[TMP2]], ptr [[TMP6]], align 2, !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]] +; CHECK-NEXT: store <4 x i16> [[TMP3]], ptr [[TMP7]], align 2, !alias.scope [[META5]], !noalias [[META8]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4 -; CHECK-NEXT: store <4 x i32> [[VEC_IND3]], ptr [[TMP10]], align 4, !alias.scope [[META7]] -; CHECK-NEXT: store <4 x i32> [[STEP_ADD4]], ptr [[TMP11]], align 4, !alias.scope [[META7]] +; CHECK-NEXT: store <4 x i32> [[VEC_IND3]], ptr [[TMP10]], align 4, !alias.scope [[META8]] +; CHECK-NEXT: store <4 x i32> [[STEP_ADD4]], ptr [[TMP11]], align 4, !alias.scope [[META8]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], ; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i32> [[STEP_ADD4]], ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -127,7 +162,7 @@ define void @multiple_truncated_ivs_with_wide_uses(i1 %c, ptr %A, ptr %B) { ; CHECK-NEXT: store i32 [[IV_32]], ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 64 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -176,7 +211,7 @@ define void @truncated_ivs_with_wide_and_scalar_uses(i1 %c, ptr %dst) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i16> [[STEP_ADD]], ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -191,7 +226,7 @@ define void @truncated_ivs_with_wide_and_scalar_uses(i1 %c, ptr %dst) { ; CHECK-NEXT: store i16 [[SEL]], ptr [[GEP]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 64 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -271,50 +306,50 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 { ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP23]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1, !alias.scope [[META13:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1, !alias.scope [[META14:![0-9]+]] ; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[NEXT_GEP22]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1, !alias.scope [[META13]] +; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1, !alias.scope [[META14]] ; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> ; CHECK-NEXT: [[TMP22]] = add <16 x i32> [[TMP19]], [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP22]], <16 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = lshr <16 x i32> [[TMP23]], ; CHECK-NEXT: [[TMP25:%.*]] = trunc <16 x i32> [[TMP24]] to <16 x i8> ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[TMP25]], i32 0 -; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META16:![0-9]+]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META17:![0-9]+]], !noalias [[META14]] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP25]], i32 1 -; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[TMP25]], i32 2 -; CHECK-NEXT: store i8 [[TMP28]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP28]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i8> [[TMP25]], i32 3 -; CHECK-NEXT: store i8 [[TMP29]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP29]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[TMP25]], i32 4 -; CHECK-NEXT: store i8 [[TMP30]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP30]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP25]], i32 5 -; CHECK-NEXT: store i8 [[TMP31]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP31]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[TMP25]], i32 6 -; CHECK-NEXT: store i8 [[TMP32]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP32]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i8> [[TMP25]], i32 7 -; CHECK-NEXT: store i8 [[TMP33]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP33]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[TMP25]], i32 8 -; CHECK-NEXT: store i8 [[TMP34]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP34]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP25]], i32 9 -; CHECK-NEXT: store i8 [[TMP35]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP35]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[TMP25]], i32 10 -; CHECK-NEXT: store i8 [[TMP36]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP36]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i8> [[TMP25]], i32 11 -; CHECK-NEXT: store i8 [[TMP37]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP37]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[TMP25]], i32 12 -; CHECK-NEXT: store i8 [[TMP38]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP38]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP25]], i32 13 -; CHECK-NEXT: store i8 [[TMP39]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP39]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[TMP25]], i32 14 -; CHECK-NEXT: store i8 [[TMP40]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP40]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i8> [[TMP25]], i32 15 -; CHECK-NEXT: store i8 [[TMP41]], ptr [[NEXT_GEP21]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP41]], ptr [[NEXT_GEP21]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967184 -; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP22]], i32 15 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -344,7 +379,7 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 { ; CHECK-NEXT: [[DEC]] = add i32 [[IV_1]], 1 ; CHECK-NEXT: [[OUTPTR_0]] = getelementptr i8, ptr [[PTR_IV_1]], i64 2 ; CHECK-NEXT: [[CMP30_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP30_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP30_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -393,7 +428,7 @@ define i16 @iv_and_step_trunc() { ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 @@ -410,7 +445,7 @@ define i16 @iv_and_step_trunc() { ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[IV_NEXT]] to i16 ; CHECK-NEXT: [[REC_NEXT]] = mul i16 [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i16 [[REC_LCSSA]] @@ -435,8 +470,10 @@ exit: define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-LABEL: define i32 @test_scalar_predicated_cost( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer @@ -444,8 +481,8 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT4]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i64> [[VEC_IND]], ; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <8 x i64> [[STEP_ADD]], ; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <8 x i64> [[STEP_ADD1]], @@ -484,12 +521,42 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD2]], ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL1]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[Y]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[LOOP_HEADER]] ] +; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX4]], 0 +; CHECK-NEXT: [[TMP32:%.*]] = icmp ule <4 x i64> [[VEC_IND5]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[TMP32]], +; CHECK-NEXT: [[TMP34:%.*]] = or <4 x i64> [[BROADCAST_SPLAT10]], [[VEC_IND5]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = trunc <4 x i64> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP35]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP36]], ptr [[TMP29]], i32 4, <4 x i1> [[TMP33]]) +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX4]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 100 +; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT]], label [[SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp ule i64 [[IV]], [[Y]] @@ -503,7 +570,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 100 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret i32 0 ; @@ -584,7 +651,7 @@ define void @wide_iv_trunc(ptr %dst, i64 %N) { ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -596,7 +663,7 @@ define void @wide_iv_trunc(ptr %dst, i64 %N) { ; CHECK-NEXT: store i32 [[IV_TRUNC]], ptr [[DST]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: exit.loopexit: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -650,32 +717,267 @@ exit: ret void } +define void @wombat(i32 %arg, ptr %dst) #1 { +; CHECK-LABEL: define void @wombat( +; CHECK-SAME: i32 [[ARG:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[ARG]], 3 +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[ARG]] to i64 +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 60, [[ARG]] +; CHECK-NEXT: [[IND_END:%.*]] = add i32 [[MUL]], [[TMP0]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[ARG]], i64 0 +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> , [[DOTSPLAT3]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[ARG]], 4 +; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 0 +; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 4, [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[MUL]], [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], [[SCALAR_PH]] ], [ [[TRUNC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i32, ptr [[DST]], i64 [[PHI]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[PHI2]], 12 +; CHECK-NEXT: store i32 [[AND]], ptr [[GETELEMENTPTR]], align 4 +; CHECK-NEXT: [[MUL3:%.*]] = mul i64 [[PHI]], [[ZEXT]] +; CHECK-NEXT: [[ADD]] = add i64 [[PHI]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65 +; CHECK-NEXT: [[TRUNC]] = trunc i64 [[MUL3]] to i32 +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %mul = mul i32 %arg, 3 + %zext = zext i32 %arg to i64 + br label %loop + +loop: + %phi = phi i64 [ 4, %entry ], [ %add, %loop ] + %phi2 = phi i32 [ %mul, %entry ], [ %trunc, %loop ] + %getelementptr = getelementptr i32, ptr %dst, i64 %phi + %and = and i32 %phi2, 12 + store i32 %and, ptr %getelementptr, align 4 + %mul3 = mul i64 %phi, %zext + %add = add i64 %phi, 1 + %icmp = icmp ugt i64 %phi, 65 + %trunc = trunc i64 %mul3 to i32 + br i1 %icmp, label %exit, label %loop + +exit: + ret void +} + +define void @wombat2(i32 %arg, ptr %dst) #1 { +; CHECK-LABEL: define void @wombat2( +; CHECK-SAME: i32 [[ARG:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[ARG]], 3 +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[ARG]] to i64 +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 60, [[ARG]] +; CHECK-NEXT: [[IND_END:%.*]] = add i32 [[MUL]], [[TMP0]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[ARG]], i64 0 +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> , [[DOTSPLAT3]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[ARG]], 4 +; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 0 +; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 4, [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[MUL]], [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], [[SCALAR_PH]] ], [ [[TRUNC_1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i32, ptr [[DST]], i64 [[PHI]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[PHI2]], 12 +; CHECK-NEXT: store i32 [[AND]], ptr [[GETELEMENTPTR]], align 4 +; CHECK-NEXT: [[MUL3:%.*]] = mul i64 [[PHI]], [[ZEXT]] +; CHECK-NEXT: [[ADD]] = add i64 [[PHI]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65 +; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[MUL3]] to i60 +; CHECK-NEXT: [[TRUNC_1]] = trunc i60 [[TRUNC_0]] to i32 +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %mul = mul i32 %arg, 3 + %zext = zext i32 %arg to i64 + br label %loop + +loop: + %phi = phi i64 [ 4, %entry ], [ %add, %loop ] + %phi2 = phi i32 [ %mul, %entry ], [ %trunc.1, %loop ] + %getelementptr = getelementptr i32, ptr %dst, i64 %phi + %and = and i32 %phi2, 12 + store i32 %and, ptr %getelementptr, align 4 + %mul3 = mul i64 %phi, %zext + %add = add i64 %phi, 1 + %icmp = icmp ugt i64 %phi, 65 + %trunc.0 = trunc i64 %mul3 to i60 + %trunc.1 = trunc i60 %trunc.0 to i32 + br i1 %icmp, label %exit, label %loop + +exit: + ret void +} + + +define void @with_dead_use(i32 %arg, ptr %dst) #1 { +; CHECK-LABEL: define void @with_dead_use( +; CHECK-SAME: i32 [[ARG:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[ARG]], 3 +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[ARG]] to i64 +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 60, [[ARG]] +; CHECK-NEXT: [[IND_END:%.*]] = add i32 [[MUL]], [[TMP0]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[ARG]], i64 0 +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> , [[DOTSPLAT3]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[ARG]], 4 +; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 0 +; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 4, [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[MUL]], [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], [[SCALAR_PH]] ], [ [[TRUNC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i32, ptr [[DST]], i64 [[PHI]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[PHI2]], 12 +; CHECK-NEXT: store i32 [[AND]], ptr [[GETELEMENTPTR]], align 4 +; CHECK-NEXT: [[MUL3:%.*]] = mul i64 [[PHI]], [[ZEXT]] +; CHECK-NEXT: [[ADD]] = add i64 [[PHI]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65 +; CHECK-NEXT: [[TRUNC]] = trunc i64 [[MUL3]] to i32 +; CHECK-NEXT: [[DEAD_AND:%.*]] = and i32 [[TRUNC]], 123 +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %mul = mul i32 %arg, 3 + %zext = zext i32 %arg to i64 + br label %loop + +loop: + %phi = phi i64 [ 4, %entry ], [ %add, %loop ] + %phi2 = phi i32 [ %mul, %entry ], [ %trunc, %loop ] + %getelementptr = getelementptr i32, ptr %dst, i64 %phi + %and = and i32 %phi2, 12 + store i32 %and, ptr %getelementptr, align 4 + %mul3 = mul i64 %phi, %zext + %add = add i64 %phi, 1 + %icmp = icmp ugt i64 %phi, 65 + %trunc = trunc i64 %mul3 to i32 + %dead.and = and i32 %trunc, 123 + br i1 %icmp, label %exit, label %loop + +exit: + ret void +} + attributes #0 = { "min-legal-vector-width"="0" "target-cpu"="skylake-avx512" } ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK: [[META4]] = !{[[META5:![0-9]+]]} -; CHECK: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]} -; CHECK: [[META6]] = distinct !{[[META6]], !"LVerDomain"} -; CHECK: [[META7]] = !{[[META8:![0-9]+]]} -; CHECK: [[META8]] = distinct !{[[META8]], [[META6]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} -; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]} -; CHECK: [[META13]] = !{[[META14:![0-9]+]]} -; CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]} -; CHECK: [[META15]] = distinct !{[[META15]], !"LVerDomain"} -; CHECK: [[META16]] = !{[[META17:![0-9]+]]} -; CHECK: [[META17]] = distinct !{[[META17]], [[META15]]} -; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} -; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]]} -; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]} -; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]} -; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]} -; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} +; CHECK: [[META5]] = !{[[META6:![0-9]+]]} +; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]} +; CHECK: [[META7]] = distinct !{[[META7]], !"LVerDomain"} +; CHECK: [[META8]] = !{[[META9:![0-9]+]]} +; CHECK: [[META9]] = distinct !{[[META9]], [[META7]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK: [[META14]] = !{[[META15:![0-9]+]]} +; CHECK: [[META15]] = distinct !{[[META15]], [[META16:![0-9]+]]} +; CHECK: [[META16]] = distinct !{[[META16]], !"LVerDomain"} +; CHECK: [[META17]] = !{[[META18:![0-9]+]]} +; CHECK: [[META18]] = distinct !{[[META18]], [[META16]]} +; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]} +; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]]} +; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]} +; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]} +; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]} ; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]} ; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]} +; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]} +; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]} +; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]} +; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]} +; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]} +; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} +; CHECK: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]} +; CHECK: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll index b5effe73fa73d..4ff5dc1bc91c6 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll @@ -27,85 +27,85 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[OFFSET_IDX2]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2 -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2 -; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD]], <16 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD9]], <16 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD10]], <16 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD11]], <16 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 32 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 96 -; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr [[NEXT_GEP5]], align 2 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i16>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD]], <16 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD4]], <16 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD5]], <16 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD6]], <16 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 32 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 96 +; CHECK-NEXT: store <16 x i16> [[TMP4]], ptr [[NEXT_GEP3]], align 2 +; CHECK-NEXT: store <16 x i16> [[TMP5]], ptr [[TMP8]], align 2 +; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr [[TMP9]], align 2 ; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP10]], align 2 -; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP11]], align 2 -; CHECK-NEXT: store <16 x i16> [[TMP9]], ptr [[TMP12]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END24:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END21:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP15]] -; CHECK-NEXT: [[DOTCAST17:%.*]] = trunc nuw i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END18:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST17]] +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP13]] +; CHECK-NEXT: [[DOTCAST9:%.*]] = trunc nuw i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST9]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 56 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC15:%.*]] = and i64 [[TMP0]], 4294967288 -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC15]] to i32 -; CHECK-NEXT: [[IND_END16:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[N_VEC15]], 1 -; CHECK-NEXT: [[IND_END20:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[N_VEC15]], 1 -; CHECK-NEXT: [[IND_END23:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP17]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT31:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT32:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT31]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC8:%.*]] = and i64 [[TMP0]], 4294967288 +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC8]] to i32 +; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[N_VEC8]], 1 +; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[N_VEC8]], 1 +; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP15]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT23:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT24:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT23]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT33:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP18:%.*]] = shl i64 [[INDEX27]], 1 -; CHECK-NEXT: [[NEXT_GEP28:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = shl i64 [[INDEX27]], 1 -; CHECK-NEXT: [[NEXT_GEP29:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP19]] -; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <8 x i16>, ptr [[NEXT_GEP28]], align 2 -; CHECK-NEXT: [[TMP20:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[WIDE_LOAD30]], <8 x i16> [[BROADCAST_SPLAT32]]) -; CHECK-NEXT: store <8 x i16> [[TMP20]], ptr [[NEXT_GEP29]], align 2 -; CHECK-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX27]], 8 -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC15]] -; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[INDEX17:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX18:%.*]] = shl i64 [[INDEX17]], 1 +; CHECK-NEXT: [[NEXT_GEP19:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[OFFSET_IDX18]] +; CHECK-NEXT: [[OFFSET_IDX20:%.*]] = shl i64 [[INDEX17]], 1 +; CHECK-NEXT: [[NEXT_GEP21:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[OFFSET_IDX20]] +; CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i16>, ptr [[NEXT_GEP19]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[WIDE_LOAD22]], <8 x i16> [[BROADCAST_SPLAT24]]) +; CHECK-NEXT: store <8 x i16> [[TMP16]], ptr [[NEXT_GEP21]], align 2 +; CHECK-NEXT: [[INDEX_NEXT25]] = add nuw i64 [[INDEX17]], 8 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N26:%.*]] = icmp eq i64 [[N_VEC15]], [[TMP0]] +; CHECK-NEXT: [[CMP_N26:%.*]] = icmp eq i64 [[N_VEC8]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N26]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i32 [ [[IND_END16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi ptr [ [[IND_END20]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi ptr [ [[IND_END23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END10]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi ptr [ [[IND_END14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL19]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL22]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL13]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 2 -; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP23:%.*]] = tail call i16 @llvm.uadd.sat.i16(i16 [[TMP22]], i16 [[OFFSET]]) +; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP19:%.*]] = tail call i16 @llvm.uadd.sat.i16(i16 [[TMP18]], i16 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, ptr [[PDST_ADDR_07]], i64 2 -; CHECK-NEXT: store i16 [[TMP23]], ptr [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i16 [[TMP19]], ptr [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -142,7 +142,7 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: br i1 [[CMP_NOT6]], label [[WHILE_END:%.*]], label [[ITER_CHECK:%.*]] ; CHECK: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[BLOCKSIZE]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[BLOCKSIZE]], 128 @@ -155,22 +155,22 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <32 x i8>, ptr [[TMP1]], align 2 -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <32 x i8>, ptr [[TMP2]], align 2 -; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <32 x i8>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <32 x i8>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <32 x i8>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <32 x i8>, ptr [[TMP3]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD]], <32 x i8> [[WIDE_LOAD]], <32 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD9]], <32 x i8> [[WIDE_LOAD9]], <32 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD10]], <32 x i8> [[WIDE_LOAD10]], <32 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD11]], <32 x i8> [[WIDE_LOAD11]], <32 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 32 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 64 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 96 -; CHECK-NEXT: store <32 x i8> [[TMP4]], ptr [[NEXT_GEP5]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD3]], <32 x i8> [[WIDE_LOAD3]], <32 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD4]], <32 x i8> [[WIDE_LOAD4]], <32 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD5]], <32 x i8> [[WIDE_LOAD5]], <32 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 32 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 96 +; CHECK-NEXT: store <32 x i8> [[TMP4]], ptr [[NEXT_GEP2]], align 2 ; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP8]], align 2 ; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP9]], align 2 ; CHECK-NEXT: store <32 x i8> [[TMP7]], ptr [[TMP10]], align 2 @@ -181,45 +181,45 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END24:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] -; CHECK-NEXT: [[IND_END21:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] -; CHECK-NEXT: [[DOTCAST17:%.*]] = trunc nuw i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END18:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST17]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 112 +; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] +; CHECK-NEXT: [[DOTCAST8:%.*]] = trunc nuw i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END9:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST8]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 120 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC15:%.*]] = and i64 [[TMP0]], 4294967280 -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC15]] to i32 -; CHECK-NEXT: [[IND_END16:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[IND_END20:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC15]] -; CHECK-NEXT: [[IND_END23:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC15]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT31:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT32:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT31]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC7:%.*]] = and i64 [[TMP0]], 4294967288 +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC7]] to i32 +; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] +; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC7]] +; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC7]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT20]], <8 x i8> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT33:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP28:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX27]] -; CHECK-NEXT: [[NEXT_GEP29:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX27]] -; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <16 x i8>, ptr [[NEXT_GEP28]], align 2 -; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[WIDE_LOAD30]], <16 x i8> [[WIDE_LOAD30]], <16 x i8> [[BROADCAST_SPLAT32]]) -; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[NEXT_GEP29]], align 2 -; CHECK-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX27]], 16 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC15]] +; CHECK-NEXT: [[INDEX16:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX16]] +; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX16]] +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i8>, ptr [[NEXT_GEP17]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i8> @llvm.fshl.v8i8(<8 x i8> [[WIDE_LOAD19]], <8 x i8> [[WIDE_LOAD19]], <8 x i8> [[BROADCAST_SPLAT21]]) +; CHECK-NEXT: store <8 x i8> [[TMP12]], ptr [[NEXT_GEP18]], align 2 +; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX16]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC7]] ; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N26:%.*]] = icmp eq i64 [[N_VEC15]], [[TMP0]] -; CHECK-NEXT: br i1 [[CMP_N26]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N23:%.*]] = icmp eq i64 [[N_VEC7]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N23]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i32 [ [[IND_END16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi ptr [ [[IND_END20]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi ptr [ [[IND_END23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi ptr [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi ptr [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL19]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL22]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL15]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 1 ; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[PSRC_ADDR_08]], align 2 ; CHECK-NEXT: [[TMP15:%.*]] = tail call i8 @llvm.fshl.i8(i8 [[TMP14]], i8 [[TMP14]], i8 [[OFFSET]]) diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index 2b61a1cc3d78b..375e3eeeaf6fe 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -62,25 +62,25 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[SMAX2]], 9223372036854775800 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> , i32 [[BC_MERGE_RDX]], i64 0 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX15:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <8 x i32> [ [[TMP11]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX15]] -; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i32>, ptr [[TMP12]], align 8, !alias.scope [[META8:![0-9]+]] -; CHECK-NEXT: [[TMP13]] = add <8 x i32> [[VEC_PHI16]], [[WIDE_LOAD17]] +; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <8 x i32> [ [[TMP11]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX14]] +; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, ptr [[TMP12]], align 8, !alias.scope [[META8:![0-9]+]] +; CHECK-NEXT: [[TMP13]] = add <8 x i32> [[VEC_PHI15]], [[WIDE_LOAD16]] ; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META8]] -; CHECK-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX15]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT18]], [[N_VEC13]] +; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX14]], 8 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC13]] ; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP13]]) -; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC13]] -; CHECK-NEXT: br i1 [[CMP_N14]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC13]] +; CHECK-NEXT: br i1 [[CMP_N18]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: [[BC_MERGE_RDX19:%.*]] = phi i32 [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP10]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] @@ -126,7 +126,7 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 @@ -162,32 +162,53 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX2]], 8 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX2]], 12 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[SMAX2]], 9223372036854775800 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT13]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT17]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[SMAX2]], 9223372036854775804 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT12]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT14]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX11]] -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP4]], align 8, !alias.scope [[META21:![0-9]+]], !noalias [[META24:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD12]], [[BROADCAST_SPLAT14]] -; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT16]], ptr [[TMP4]], align 4, !alias.scope [[META21]], !noalias [[META24]] -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[BROADCAST_SPLAT16]], <8 x ptr> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP5]]), !alias.scope [[META24]] -; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX11]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC9]] -; CHECK-NEXT: br i1 [[TMP6]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[PRED_STORE_CONTINUE21:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX10]] +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP4]], align 8, !alias.scope [[META21:![0-9]+]], !noalias [[META24:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD11]], [[BROADCAST_SPLAT13]] +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT15]], ptr [[TMP4]], align 4, !alias.scope [[META21]], !noalias [[META24]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META24]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; CHECK: pred.store.if16: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META24]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] +; CHECK: pred.store.continue17: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; CHECK: pred.store.if18: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META24]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] +; CHECK: pred.store.continue19: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3 +; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21]] +; CHECK: pred.store.if20: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META24]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]] +; CHECK: pred.store.continue21: +; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX10]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC9]] -; CHECK-NEXT: br i1 [[CMP_N10]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N23:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[CMP_N23]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -291,28 +312,28 @@ define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_VEC17:%.*]] = and i64 [[SMAX10]], 9223372036854775800 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT22:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT21]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT23:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT24:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT23]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT27:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT26]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT20]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT22:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT23:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT22]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT25]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX19]] -; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, ptr [[TMP5]], align 8, !alias.scope [[META37:![0-9]+]], !noalias [[META40:![0-9]+]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD20]], [[BROADCAST_SPLAT22]] -; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT24]], ptr [[TMP5]], align 4, !alias.scope [[META37]], !noalias [[META40]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX19]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 8, <8 x i1> [[TMP6]], <8 x i32> poison), !alias.scope [[META43:![0-9]+]] -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[WIDE_MASKED_LOAD25]], <8 x ptr> [[BROADCAST_SPLAT27]], i32 4, <8 x i1> [[TMP6]]), !alias.scope [[META44:![0-9]+]], !noalias [[META43]] -; CHECK-NEXT: [[INDEX_NEXT28]] = add nuw i64 [[INDEX19]], 8 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC17]] +; CHECK-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX18]] +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i32>, ptr [[TMP5]], align 8, !alias.scope [[META37:![0-9]+]], !noalias [[META40:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD19]], [[BROADCAST_SPLAT21]] +; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT23]], ptr [[TMP5]], align 4, !alias.scope [[META37]], !noalias [[META40]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX18]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 8, <8 x i1> [[TMP6]], <8 x i32> poison), !alias.scope [[META43:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[WIDE_MASKED_LOAD24]], <8 x ptr> [[BROADCAST_SPLAT26]], i32 4, <8 x i1> [[TMP6]]), !alias.scope [[META44:![0-9]+]], !noalias [[META43]] +; CHECK-NEXT: [[INDEX_NEXT27]] = add nuw i64 [[INDEX18]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC17]] ; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[SMAX10]], [[N_VEC17]] -; CHECK-NEXT: br i1 [[CMP_N18]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N28:%.*]] = icmp eq i64 [[SMAX10]], [[N_VEC17]] +; CHECK-NEXT: br i1 [[CMP_N28]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll index 7159a54234b4b..b98864e800954 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll @@ -205,8 +205,10 @@ exit: define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK-LABEL: @test_tc_20( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -243,19 +245,38 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX4]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP23]], align 64 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 0 +; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD5]], ptr [[TMP25]], align 64 +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 20 +; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[LDADDR:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I]] ; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[LDADDR]], align 64 ; CHECK-NEXT: [[STADDR:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I]] ; CHECK-NEXT: store i8 [[VAL]], ptr [[STADDR]], align 64 ; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 ; CHECK-NEXT: [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 20 -; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -294,7 +315,7 @@ define void @limit_main_loop_vf_to_avoid_dead_main_vector_loop(ptr noalias %src, ; CHECK-NEXT: store <8 x i8> [[STRIDED_VEC]], ptr [[TMP4]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -308,7 +329,7 @@ define void @limit_main_loop_vf_to_avoid_dead_main_vector_loop(ptr noalias %src, ; CHECK-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 32 -; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll index 2eb14de5f6fd5..5174e6493b7c1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll @@ -6,8 +6,10 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80: define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-LABEL: define i32 @test_scalar_predicated_cost( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer @@ -15,8 +17,8 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT4]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i64> [[VEC_IND]], ; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <8 x i64> [[STEP_ADD]], ; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <8 x i64> [[STEP_ADD1]], @@ -57,10 +59,40 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 ; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL1]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[Y]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[LOOP_HEADER]] ] +; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX4]], 0 +; CHECK-NEXT: [[TMP32:%.*]] = icmp ule <4 x i64> [[VEC_IND5]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[TMP32]], +; CHECK-NEXT: [[TMP34:%.*]] = or <4 x i64> [[BROADCAST_SPLAT10]], [[VEC_IND5]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = trunc <4 x i64> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP35]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP36]], ptr [[TMP29]], i32 4, <4 x i1> [[TMP33]]) +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX4]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 100 +; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT]], label [[SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp ule i64 [[IV]], [[Y]] @@ -74,7 +106,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 100 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret i32 0 ; @@ -121,7 +153,7 @@ define void @test_scalar_cost_single_store_loop_invariant_cond(ptr %dst, i1 %c) ; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP1]], i32 4, <8 x i1> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -139,7 +171,7 @@ define void @test_scalar_cost_single_store_loop_invariant_cond(ptr %dst, i1 %c) ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 116 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -199,7 +231,7 @@ define void @test_scalar_cost_single_store_loop_varying_cond(ptr %dst, ptr noali ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP11]], i32 4, <4 x i1> [[TMP9]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -220,7 +252,7 @@ define void @test_scalar_cost_single_store_loop_varying_cond(ptr %dst, ptr noali ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 116 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -254,9 +286,10 @@ attributes #0 = { "min-legal-vector-width"="0" "target-cpu"="skylake-avx512" } ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index aea72b7de5f42..5b52d15cc7dda 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -76,18 +76,20 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @foo1( -; AVX2-NEXT: entry: +; AVX2-NEXT: iter.check: ; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64 ; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64 ; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64 -; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: ; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] ; AVX2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128 ; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]] ; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128 ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] -; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX2: vector.main.loop.iter.check: +; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: @@ -144,27 +146,51 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX2-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX2: middle.block: -; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX2: vec.epilog.iter.check: +; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX2: vec.epilog.ph: +; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX2-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AVX2: vec.epilog.vector.body: +; AVX2-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX2-NEXT: [[TMP39:%.*]] = add i64 [[INDEX11]], 0 +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP39]] +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP40]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP41]], align 4 +; AVX2-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], +; AVX2-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP39]] +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[TMP43]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP44]], i32 4, <8 x i1> [[TMP42]], <8 x i32> poison) +; AVX2-NEXT: [[TMP45:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP39]] +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[TMP46]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP45]], ptr [[TMP47]], i32 4, <8 x i1> [[TMP42]]) +; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 +; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 +; AVX2-NEXT: br i1 [[TMP48]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX2: vec.epilog.middle.block: +; AVX2-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX2: vec.epilog.scalar.ph: +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: -; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 +; AVX2-NEXT: [[TMP49:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP40:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 -; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP40]], [[TMP39]] +; AVX2-NEXT: [[TMP50:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]] ; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] ; AVX2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -246,21 +272,21 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; AVX512: vec.epilog.vector.body: -; AVX512-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP39:%.*]] = add i64 [[INDEX12]], 0 +; AVX512-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP39:%.*]] = add i64 [[INDEX11]], 0 ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP39]] ; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP40]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP41]], align 4 -; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr [[TMP41]], align 4 +; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP39]] ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[TMP43]], i32 0 -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP44]], i32 4, <8 x i1> [[TMP42]], <8 x i32> poison) -; AVX512-NEXT: [[TMP45:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP44]], i32 4, <16 x i1> [[TMP42]], <16 x i32> poison) +; AVX512-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP39]] ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[TMP46]], i32 0 -; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP45]], ptr [[TMP47]], i32 4, <8 x i1> [[TMP42]]) -; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8 -; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP45]], ptr [[TMP47]], i32 4, <16 x i1> [[TMP42]]) +; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16 +; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 ; AVX512-NEXT: br i1 [[TMP48]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX512: vec.epilog.middle.block: ; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] @@ -375,18 +401,20 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @foo1_addrspace1( -; AVX2-NEXT: entry: +; AVX2-NEXT: iter.check: ; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64 ; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64 ; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64 -; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: ; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] ; AVX2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128 ; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]] ; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128 ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] -; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX2: vector.main.loop.iter.check: +; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: @@ -441,29 +469,53 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP29]], ptr addrspace(1) [[TMP37]], i32 4, <8 x i1> [[TMP17]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX2: middle.block: -; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX2: vec.epilog.iter.check: +; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX2: vec.epilog.ph: +; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX2-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AVX2: vec.epilog.vector.body: +; AVX2-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX2-NEXT: [[TMP39:%.*]] = add i64 [[INDEX11]], 0 +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP39]] +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP40]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP41]], align 4 +; AVX2-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], +; AVX2-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP39]] +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP43]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP44]], i32 4, <8 x i1> [[TMP42]], <8 x i32> poison) +; AVX2-NEXT: [[TMP45:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP39]] +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP46]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP45]], ptr addrspace(1) [[TMP47]], i32 4, <8 x i1> [[TMP42]]) +; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 +; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 +; AVX2-NEXT: br i1 [[TMP48]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; AVX2: vec.epilog.middle.block: +; AVX2-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX2: vec.epilog.scalar.ph: +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: -; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 +; AVX2-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4 -; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP40]], [[TMP39]] +; AVX2-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]] ; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]] ; AVX2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -545,21 +597,21 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; AVX512: vec.epilog.vector.body: -; AVX512-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP39:%.*]] = add i64 [[INDEX12]], 0 +; AVX512-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP39:%.*]] = add i64 [[INDEX11]], 0 ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP39]] ; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP40]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP41]], align 4 -; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP41]], align 4 +; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP39]] ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP43]], i32 0 -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP44]], i32 4, <8 x i1> [[TMP42]], <8 x i32> poison) -; AVX512-NEXT: [[TMP45:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP44]], i32 4, <16 x i1> [[TMP42]], <16 x i32> poison) +; AVX512-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP39]] ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP46]], i32 0 -; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP45]], ptr addrspace(1) [[TMP47]], i32 4, <8 x i1> [[TMP42]]) -; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8 -; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP45]], ptr addrspace(1) [[TMP47]], i32 4, <16 x i1> [[TMP42]]) +; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16 +; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 ; AVX512-NEXT: br i1 [[TMP48]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AVX512: vec.epilog.middle.block: ; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] @@ -685,18 +737,20 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @foo2( -; AVX2-NEXT: entry: +; AVX2-NEXT: iter.check: ; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64 ; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64 ; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64 -; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: ; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] ; AVX2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128 ; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]] ; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128 ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] -; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX2: vector.main.loop.iter.check: +; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: @@ -755,30 +809,55 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP33]], ptr [[TMP41]], i32 4, <8 x i1> [[TMP17]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; AVX2: middle.block: -; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX2: vec.epilog.iter.check: +; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX2: vec.epilog.ph: +; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX2-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AVX2: vec.epilog.vector.body: +; AVX2-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX2-NEXT: [[TMP43:%.*]] = add i64 [[INDEX11]], 0 +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP43]] +; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP45]], align 4 +; AVX2-NEXT: [[TMP46:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP43]] +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr float, ptr [[TMP47]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP48]], i32 4, <8 x i1> [[TMP46]], <8 x float> poison) +; AVX2-NEXT: [[TMP49:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float> +; AVX2-NEXT: [[TMP50:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD13]], [[TMP49]] +; AVX2-NEXT: [[TMP51:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP43]] +; AVX2-NEXT: [[TMP52:%.*]] = getelementptr float, ptr [[TMP51]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP50]], ptr [[TMP52]], i32 4, <8 x i1> [[TMP46]]) +; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 +; AVX2-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 +; AVX2-NEXT: br i1 [[TMP53]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; AVX2: vec.epilog.middle.block: +; AVX2-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX2: vec.epilog.scalar.ph: +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: -; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP43]], 100 +; AVX2-NEXT: [[TMP54:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP54]], 100 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP44:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 -; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP43]] to float -; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP44]], [[CONV]] +; AVX2-NEXT: [[TMP55:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP54]] to float +; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP55]], [[CONV]] ; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] ; AVX2-NEXT: store float [[ADD]], ptr [[ARRAYIDX7]], align 4 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -864,22 +943,22 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; AVX512: vec.epilog.vector.body: -; AVX512-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP43:%.*]] = add i64 [[INDEX12]], 0 +; AVX512-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP43:%.*]] = add i64 [[INDEX11]], 0 ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP43]] ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP45]], align 4 -; AVX512-NEXT: [[TMP46:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr [[TMP45]], align 4 +; AVX512-NEXT: [[TMP46:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP43]] ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr float, ptr [[TMP47]], i32 0 -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP48]], i32 4, <8 x i1> [[TMP46]], <8 x float> poison) -; AVX512-NEXT: [[TMP49:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float> -; AVX512-NEXT: [[TMP50:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD14]], [[TMP49]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP48]], i32 4, <16 x i1> [[TMP46]], <16 x float> poison) +; AVX512-NEXT: [[TMP49:%.*]] = sitofp <16 x i32> [[WIDE_LOAD12]] to <16 x float> +; AVX512-NEXT: [[TMP50:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD13]], [[TMP49]] ; AVX512-NEXT: [[TMP51:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP43]] ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr float, ptr [[TMP51]], i32 0 -; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP50]], ptr [[TMP52]], i32 4, <8 x i1> [[TMP46]]) -; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8 -; AVX512-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP50]], ptr [[TMP52]], i32 4, <16 x i1> [[TMP46]]) +; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16 +; AVX512-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 ; AVX512-NEXT: br i1 [[TMP53]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; AVX512: vec.epilog.middle.block: ; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] @@ -947,121 +1026,223 @@ for.end: ; preds = %for.inc ;} define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 { -; AVX-LABEL: @foo3( -; AVX-NEXT: entry: -; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; AVX: vector.memcheck: -; AVX-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 -; AVX-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 -; AVX-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 -; AVX-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]] -; AVX-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] -; AVX-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]] -; AVX-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]] -; AVX-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] -; AVX-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; AVX-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; AVX: vector.ph: -; AVX-NEXT: br label [[VECTOR_BODY:%.*]] -; AVX: vector.body: -; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; AVX-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 -; AVX-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 -; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 -; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4, !alias.scope !8 -; AVX-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope !8 -; AVX-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !alias.scope !8 -; AVX-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope !8 -; AVX-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], -; AVX-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], -; AVX-NEXT: [[TMP14:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], -; AVX-NEXT: [[TMP15:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], -; AVX-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]] -; AVX-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP1]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP2]] -; AVX-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP3]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP16]], i32 0 -; AVX-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP16]], i32 4 -; AVX-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP16]], i32 8 -; AVX-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[TMP16]], i32 12 -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP12]], <4 x double> poison), !alias.scope !11 -; AVX-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> poison), !alias.scope !11 -; AVX-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP14]], <4 x double> poison), !alias.scope !11 -; AVX-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double> poison), !alias.scope !11 -; AVX-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> -; AVX-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double> -; AVX-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double> -; AVX-NEXT: [[TMP27:%.*]] = sitofp <4 x i32> [[WIDE_LOAD8]] to <4 x double> -; AVX-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP24]] -; AVX-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD9]], [[TMP25]] -; AVX-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP26]] -; AVX-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP27]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]] -; AVX-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP1]] -; AVX-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP2]] -; AVX-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] -; AVX-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP32]], i32 0 -; AVX-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP32]], i32 4 -; AVX-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP32]], i32 8 -; AVX-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP32]], i32 12 -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP28]], ptr [[TMP36]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !13, !noalias !15 -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP29]], ptr [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !13, !noalias !15 -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP30]], ptr [[TMP38]], i32 8, <4 x i1> [[TMP14]]), !alias.scope !13, !noalias !15 -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP31]], ptr [[TMP39]], i32 8, <4 x i1> [[TMP15]]), !alias.scope !13, !noalias !15 -; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] -; AVX: middle.block: -; AVX-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AVX: scalar.ph: -; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; AVX-NEXT: br label [[FOR_BODY:%.*]] -; AVX: for.body: -; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; AVX-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP41]], 100 -; AVX-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; AVX: if.then: -; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP42:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 -; AVX-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP41]] to double -; AVX-NEXT: [[ADD:%.*]] = fadd double [[TMP42]], [[CONV]] -; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] -; AVX-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 -; AVX-NEXT: br label [[FOR_INC]] -; AVX: for.inc: -; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; AVX: for.end: -; AVX-NEXT: ret void +; AVX1-LABEL: @foo3( +; AVX1-NEXT: entry: +; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; AVX1: vector.memcheck: +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 +; AVX1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 +; AVX1-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX1-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; AVX1-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] +; AVX1-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX1: vector.ph: +; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 +; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4, !alias.scope [[META8:![0-9]+]] +; AVX1-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META8]] +; AVX1-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META8]] +; AVX1-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META8]] +; AVX1-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], +; AVX1-NEXT: [[TMP14:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], +; AVX1-NEXT: [[TMP15:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], +; AVX1-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP16]], i32 0 +; AVX1-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP16]], i32 4 +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP16]], i32 8 +; AVX1-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[TMP16]], i32 12 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP12]], <4 x double> poison), !alias.scope [[META11:![0-9]+]] +; AVX1-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> poison), !alias.scope [[META11]] +; AVX1-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP14]], <4 x double> poison), !alias.scope [[META11]] +; AVX1-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double> poison), !alias.scope [[META11]] +; AVX1-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> +; AVX1-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double> +; AVX1-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double> +; AVX1-NEXT: [[TMP27:%.*]] = sitofp <4 x i32> [[WIDE_LOAD8]] to <4 x double> +; AVX1-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP24]] +; AVX1-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD9]], [[TMP25]] +; AVX1-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP26]] +; AVX1-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP27]] +; AVX1-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP32]], i32 0 +; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP32]], i32 4 +; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP32]], i32 8 +; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP32]], i32 12 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP28]], ptr [[TMP36]], i32 8, <4 x i1> [[TMP12]]), !alias.scope [[META13:![0-9]+]], !noalias [[META15:![0-9]+]] +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP29]], ptr [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope [[META13]], !noalias [[META15]] +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP30]], ptr [[TMP38]], i32 8, <4 x i1> [[TMP14]]), !alias.scope [[META13]], !noalias [[META15]] +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP31]], ptr [[TMP39]], i32 8, <4 x i1> [[TMP15]]), !alias.scope [[META13]], !noalias [[META15]] +; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; AVX1-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX1-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; AVX1: middle.block: +; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; AVX1: scalar.ph: +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX1-NEXT: br label [[FOR_BODY:%.*]] +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP41]], 100 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; AVX1: if.then: +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP42:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP41]] to double +; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP42]], [[CONV]] +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 +; AVX1-NEXT: br label [[FOR_INC]] +; AVX1: for.inc: +; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @foo3( +; AVX2-NEXT: entry: +; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; AVX2: vector.memcheck: +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 +; AVX2-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 +; AVX2-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX2-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; AVX2-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] +; AVX2-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX2: vector.ph: +; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4, !alias.scope [[META11:![0-9]+]] +; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META11]] +; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META11]] +; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META11]] +; AVX2-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], +; AVX2-NEXT: [[TMP14:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], +; AVX2-NEXT: [[TMP15:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP16]], i32 0 +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP16]], i32 4 +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP16]], i32 8 +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[TMP16]], i32 12 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP12]], <4 x double> poison), !alias.scope [[META14:![0-9]+]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> poison), !alias.scope [[META14]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP14]], <4 x double> poison), !alias.scope [[META14]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double> poison), !alias.scope [[META14]] +; AVX2-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> +; AVX2-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double> +; AVX2-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double> +; AVX2-NEXT: [[TMP27:%.*]] = sitofp <4 x i32> [[WIDE_LOAD8]] to <4 x double> +; AVX2-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP24]] +; AVX2-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD9]], [[TMP25]] +; AVX2-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP26]] +; AVX2-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP27]] +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP32]], i32 0 +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP32]], i32 4 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP32]], i32 8 +; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP32]], i32 12 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP28]], ptr [[TMP36]], i32 8, <4 x i1> [[TMP12]]), !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP29]], ptr [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope [[META16]], !noalias [[META18]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP30]], ptr [[TMP38]], i32 8, <4 x i1> [[TMP14]]), !alias.scope [[META16]], !noalias [[META18]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP31]], ptr [[TMP39]], i32 8, <4 x i1> [[TMP15]]), !alias.scope [[META16]], !noalias [[META18]] +; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; AVX2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX2-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX2: middle.block: +; AVX2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; AVX2: scalar.ph: +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP41]], 100 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; AVX2: if.then: +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP42:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP41]] to double +; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP42]], [[CONV]] +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX2: for.end: +; AVX2-NEXT: ret void ; ; AVX512-LABEL: @foo3( -; AVX512-NEXT: entry: -; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; AVX512-NEXT: iter.check: +; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 -; AVX512-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 -; AVX512-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 +; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 +; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]] -; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]] +; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX512: vector.main.loop.iter.check: +; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: @@ -1078,10 +1259,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 24 -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4, !alias.scope !11 -; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope !11 -; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4, !alias.scope !11 -; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4, !alias.scope !11 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4, !alias.scope [[META11:![0-9]+]] +; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META11]] +; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META11]] +; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META11]] ; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], ; AVX512-NEXT: [[TMP14:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], @@ -1094,10 +1275,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP16]], i32 8 ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP16]], i32 16 ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[TMP16]], i32 24 -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP12]], <8 x double> poison), !alias.scope !14 -; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> poison), !alias.scope !14 -; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP14]], <8 x double> poison), !alias.scope !14 -; AVX512-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double> poison), !alias.scope !14 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP12]], <8 x double> poison), !alias.scope [[META14:![0-9]+]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> poison), !alias.scope [[META14]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP14]], <8 x double> poison), !alias.scope [[META14]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double> poison), !alias.scope [[META14]] ; AVX512-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double> ; AVX512-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x double> ; AVX512-NEXT: [[TMP26:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x double> @@ -1114,36 +1295,61 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP32]], i32 8 ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP32]], i32 16 ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP32]], i32 24 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP28]], ptr [[TMP36]], i32 8, <8 x i1> [[TMP12]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP29]], ptr [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP30]], ptr [[TMP38]], i32 8, <8 x i1> [[TMP14]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP31]], ptr [[TMP39]], i32 8, <8 x i1> [[TMP15]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP28]], ptr [[TMP36]], i32 8, <8 x i1> [[TMP12]]), !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP29]], ptr [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope [[META16]], !noalias [[META18]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP30]], ptr [[TMP38]], i32 8, <8 x i1> [[TMP14]]), !alias.scope [[META16]], !noalias [[META18]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP31]], ptr [[TMP39]], i32 8, <8 x i1> [[TMP15]]), !alias.scope [[META16]], !noalias [[META18]] ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; AVX512: middle.block: -; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX512: vec.epilog.iter.check: +; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX512: vec.epilog.ph: +; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AVX512: vec.epilog.vector.body: +; AVX512-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP41:%.*]] = add i64 [[INDEX12]], 0 +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP41]] +; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP43]], align 4, !alias.scope [[META20:![0-9]+]] +; AVX512-NEXT: [[TMP44:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP41]] +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[TMP45]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP46]], i32 8, <8 x i1> [[TMP44]], <8 x double> poison), !alias.scope [[META23:![0-9]+]] +; AVX512-NEXT: [[TMP47:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double> +; AVX512-NEXT: [[TMP48:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD14]], [[TMP47]] +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP41]] +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP49]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP48]], ptr [[TMP50]], i32 8, <8 x i1> [[TMP44]]), !alias.scope [[META25:![0-9]+]], !noalias [[META27:![0-9]+]] +; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8 +; AVX512-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 +; AVX512-NEXT: br i1 [[TMP51]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; AVX512: vec.epilog.middle.block: +; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX512: vec.epilog.scalar.ph: +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: -; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP41]], 100 +; AVX512-NEXT: [[TMP52:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP52]], 100 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP42:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 -; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP41]] to double -; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP42]], [[CONV]] +; AVX512-NEXT: [[TMP53:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP52]] to double +; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP53]], [[CONV]] ; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] ; AVX512-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1216,14 +1422,14 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: entry: ; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 -; AVX512-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940 -; AVX512-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752 -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 +; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940 +; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]] -; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]] +; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -1233,19 +1439,19 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP0]], i32 4, <8 x i1> , <8 x i32> poison), !alias.scope !21 +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP0]], i32 4, <8 x i1> , <8 x i32> poison), !alias.scope [[META30:![0-9]+]] ; AVX512-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], ; AVX512-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], ; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[B]], <8 x i64> [[TMP2]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope !24 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope [[META33:![0-9]+]] ; AVX512-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double> ; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER6]], [[TMP4]] ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[A]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope [[META35:![0-9]+]], !noalias [[META37:![0-9]+]] ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 -; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: @@ -1269,7 +1475,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP30:![0-9]+]] +; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP39:![0-9]+]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1343,14 +1549,14 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: entry: ; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: -; AVX2-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768 -; AVX2-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384 -; AVX2-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768 -; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP1]] -; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768 +; AVX2-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384 +; AVX2-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768 +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]] +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] ; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX2-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP2]] -; AVX2-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[UGLYGEP]] +; AVX2-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP2]] +; AVX2-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]] ; AVX2-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -1375,13 +1581,13 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 -3 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -12 ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 -3 -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope !18 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META21:![0-9]+]] ; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope !18 +; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META21]] ; AVX2-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD6]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4, !alias.scope !18 +; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4, !alias.scope [[META21]] ; AVX2-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD8]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope !18 +; AVX2-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope [[META21]] ; AVX2-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD10]], <4 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer ; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE7]], zeroinitializer @@ -1400,16 +1606,16 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -12 ; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -3 ; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP25]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope !21 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP25]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope [[META24:![0-9]+]] ; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP27]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope !21 +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP27]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope [[META24]] ; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD15]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP18]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope !21 +; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope [[META24]] ; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP19]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !21 +; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope [[META24]] ; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[REVERSE13]], ; AVX2-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[REVERSE16]], @@ -1427,17 +1633,17 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[TMP44]], i32 -3 ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[TMP36]], i32 -12 ; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[TMP46]], i32 -3 -; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x double> [[TMP32]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE23]], ptr [[TMP41]], i32 8, <4 x i1> [[REVERSE12]]), !alias.scope !23, !noalias !25 -; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[TMP33]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE25]], ptr [[TMP43]], i32 8, <4 x i1> [[REVERSE14]]), !alias.scope !23, !noalias !25 -; AVX2-NEXT: [[REVERSE27:%.*]] = shufflevector <4 x double> [[TMP34]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE27]], ptr [[TMP45]], i32 8, <4 x i1> [[REVERSE17]]), !alias.scope !23, !noalias !25 -; AVX2-NEXT: [[REVERSE29:%.*]] = shufflevector <4 x double> [[TMP35]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE29]], ptr [[TMP47]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !23, !noalias !25 +; AVX2-NEXT: [[REVERSE24:%.*]] = shufflevector <4 x double> [[TMP32]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE24]], ptr [[TMP41]], i32 8, <4 x i1> [[REVERSE12]]), !alias.scope [[META26:![0-9]+]], !noalias [[META28:![0-9]+]] +; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x double> [[TMP33]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE26]], ptr [[TMP43]], i32 8, <4 x i1> [[REVERSE14]]), !alias.scope [[META26]], !noalias [[META28]] +; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[TMP34]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE28]], ptr [[TMP45]], i32 8, <4 x i1> [[REVERSE17]]), !alias.scope [[META26]], !noalias [[META28]] +; AVX2-NEXT: [[REVERSE30:%.*]] = shufflevector <4 x double> [[TMP35]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE30]], ptr [[TMP47]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope [[META26]], !noalias [[META28]] ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX2: scalar.ph: @@ -1459,7 +1665,7 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -1467,14 +1673,14 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512-NEXT: entry: ; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768 -; AVX512-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384 -; AVX512-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768 -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP1]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768 +; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384 +; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP2]] -; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[UGLYGEP]] +; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP2]] +; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -1499,13 +1705,13 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 -7 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -24 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 -7 -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope !31 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META40:![0-9]+]] ; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4, !alias.scope !31 +; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META40]] ; AVX512-NEXT: [[REVERSE7:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD6]], <8 x i32> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4, !alias.scope !31 +; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4, !alias.scope [[META40]] ; AVX512-NEXT: [[REVERSE9:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD8]], <8 x i32> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP15]], align 4, !alias.scope !31 +; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP15]], align 4, !alias.scope [[META40]] ; AVX512-NEXT: [[REVERSE11:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD10]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <8 x i32> [[REVERSE7]], zeroinitializer @@ -1524,16 +1730,16 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -24 ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -7 ; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP25]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP25]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META43:![0-9]+]] ; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP27]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP27]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope [[META43]] ; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD15]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP18]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META43]] ; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP19]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope [[META43]] ; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[REVERSE13]], ; AVX512-NEXT: [[TMP33:%.*]] = fadd <8 x double> [[REVERSE16]], @@ -1551,17 +1757,17 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[TMP44]], i32 -7 ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[TMP36]], i32 -24 ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[TMP46]], i32 -7 -; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE23]], ptr [[TMP41]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope !36, !noalias !38 -; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[TMP33]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE25]], ptr [[TMP43]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope !36, !noalias !38 -; AVX512-NEXT: [[REVERSE27:%.*]] = shufflevector <8 x double> [[TMP34]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE27]], ptr [[TMP45]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope !36, !noalias !38 -; AVX512-NEXT: [[REVERSE29:%.*]] = shufflevector <8 x double> [[TMP35]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE29]], ptr [[TMP47]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: [[REVERSE24:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE24]], ptr [[TMP41]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope [[META45:![0-9]+]], !noalias [[META47:![0-9]+]] +; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x double> [[TMP33]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE26]], ptr [[TMP43]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope [[META45]], !noalias [[META47]] +; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[TMP34]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE28]], ptr [[TMP45]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope [[META45]], !noalias [[META47]] +; AVX512-NEXT: [[REVERSE30:%.*]] = shufflevector <8 x double> [[TMP35]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE30]], ptr [[TMP47]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope [[META45]], !noalias [[META47]] ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] +; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: @@ -1583,7 +1789,7 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX512-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] +; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1626,11 +1832,14 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-LABEL: @foo7( ; AVX1-NEXT: entry: ; AVX1-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 -; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX1: for.body.preheader: +; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[ITER_CHECK:%.*]] +; AVX1: iter.check: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 -; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX1: vector.main.loop.iter.check: +; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: ; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] @@ -1650,77 +1859,110 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8 ; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12 ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 -; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 -; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 -; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; AVX1-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 ; AVX1-NEXT: [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], -; AVX1-NEXT: [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD2]], -; AVX1-NEXT: [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD3]], +; AVX1-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD2]], +; AVX1-NEXT: [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD3]], +; AVX1-NEXT: [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD4]], ; AVX1-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[TMP12]], zeroinitializer ; AVX1-NEXT: [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer ; AVX1-NEXT: [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer ; AVX1-NEXT: [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer -; AVX1-NEXT: [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], -; AVX1-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], -; AVX1-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], -; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], -; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 -; AVX1-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4 -; AVX1-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 -; AVX1-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12 -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison) -; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison) -; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison) -; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP16]], +; AVX1-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP17]], +; AVX1-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP18]], +; AVX1-NEXT: [[TMP23:%.*]] = xor <4 x i1> [[TMP19]], +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP27:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 0 +; AVX1-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 4 +; AVX1-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 8 +; AVX1-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 12 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP20]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP21]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP22]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP23]], <4 x ptr> poison) ; AVX1-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX1-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX1-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX1-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], -; AVX1-NEXT: [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], -; AVX1-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], -; AVX1-NEXT: [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], -; AVX1-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 -; AVX1-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4 -; AVX1-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 -; AVX1-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]]) -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]]) -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]]) -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]]) +; AVX1-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX1-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX1-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD7]], zeroinitializer +; AVX1-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[TMP32]], +; AVX1-NEXT: [[TMP37:%.*]] = xor <4 x i1> [[TMP33]], +; AVX1-NEXT: [[TMP38:%.*]] = xor <4 x i1> [[TMP34]], +; AVX1-NEXT: [[TMP39:%.*]] = xor <4 x i1> [[TMP35]], +; AVX1-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP20]], <4 x i1> [[TMP36]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP21]], <4 x i1> [[TMP37]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP42:%.*]] = select <4 x i1> [[TMP22]], <4 x i1> [[TMP38]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP23]], <4 x i1> [[TMP39]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP44]], i32 0 +; AVX1-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP44]], i32 4 +; AVX1-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP44]], i32 8 +; AVX1-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP44]], i32 12 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP40]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP41]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP42]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP43]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX1: vec.epilog.iter.check: +; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX1: vec.epilog.ph: +; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX1-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] +; AVX1-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AVX1: vec.epilog.vector.body: +; AVX1-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX1-NEXT: [[TMP53:%.*]] = add i64 [[INDEX10]], 0 +; AVX1-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP53]] +; AVX1-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0 +; AVX1-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP55]], align 1 +; AVX1-NEXT: [[TMP56:%.*]] = and <4 x i8> [[WIDE_LOAD11]], +; AVX1-NEXT: [[TMP57:%.*]] = icmp eq <4 x i8> [[TMP56]], zeroinitializer +; AVX1-NEXT: [[TMP58:%.*]] = xor <4 x i1> [[TMP57]], +; AVX1-NEXT: [[TMP59:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP53]] +; AVX1-NEXT: [[TMP60:%.*]] = getelementptr ptr, ptr [[TMP59]], i32 0 +; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP60]], i32 8, <4 x i1> [[TMP58]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP61:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX1-NEXT: [[TMP62:%.*]] = xor <4 x i1> [[TMP61]], +; AVX1-NEXT: [[TMP63:%.*]] = select <4 x i1> [[TMP58]], <4 x i1> [[TMP62]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP64:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP53]] +; AVX1-NEXT: [[TMP65:%.*]] = getelementptr double, ptr [[TMP64]], i32 0 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP65]], i32 8, <4 x i1> [[TMP63]]) +; AVX1-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 +; AVX1-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] +; AVX1-NEXT: br i1 [[TMP66]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX1: vec.epilog.middle.block: +; AVX1-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; AVX1-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX1: vec.epilog.scalar.ph: +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; AVX1-NEXT: [[TMP54:%.*]] = and i8 [[TMP53]], 1 -; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0 +; AVX1-NEXT: [[TMP67:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AVX1-NEXT: [[TMP68:%.*]] = and i8 [[TMP67]], 1 +; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP68]], 0 ; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX1: land.lhs.true: ; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -; AVX1-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null +; AVX1-NEXT: [[TMP69:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 +; AVX1-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP69]], null ; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] @@ -1729,7 +1971,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -1738,11 +1980,14 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-LABEL: @foo7( ; AVX2-NEXT: entry: ; AVX2-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 -; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX2: for.body.preheader: +; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[ITER_CHECK:%.*]] +; AVX2: iter.check: ; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 -; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX2: vector.main.loop.iter.check: +; AVX2-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; AVX2-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] @@ -1762,77 +2007,110 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12 ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 -; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 -; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 -; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; AVX2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 ; AVX2-NEXT: [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], -; AVX2-NEXT: [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD2]], -; AVX2-NEXT: [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD3]], +; AVX2-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD2]], +; AVX2-NEXT: [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD3]], +; AVX2-NEXT: [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD4]], ; AVX2-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[TMP12]], zeroinitializer ; AVX2-NEXT: [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer ; AVX2-NEXT: [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer ; AVX2-NEXT: [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer -; AVX2-NEXT: [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], -; AVX2-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], -; AVX2-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], -; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 -; AVX2-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4 -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 -; AVX2-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12 -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP16]], +; AVX2-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP17]], +; AVX2-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP18]], +; AVX2-NEXT: [[TMP23:%.*]] = xor <4 x i1> [[TMP19]], +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 0 +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 4 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 8 +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 12 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP20]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP21]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP22]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP23]], <4 x ptr> poison) ; AVX2-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX2-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX2-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX2-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX2-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], -; AVX2-NEXT: [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], -; AVX2-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], -; AVX2-NEXT: [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], -; AVX2-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 -; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4 -; AVX2-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 -; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]]) -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]]) -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]]) -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]]) +; AVX2-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX2-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX2-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD7]], zeroinitializer +; AVX2-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[TMP32]], +; AVX2-NEXT: [[TMP37:%.*]] = xor <4 x i1> [[TMP33]], +; AVX2-NEXT: [[TMP38:%.*]] = xor <4 x i1> [[TMP34]], +; AVX2-NEXT: [[TMP39:%.*]] = xor <4 x i1> [[TMP35]], +; AVX2-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP20]], <4 x i1> [[TMP36]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP21]], <4 x i1> [[TMP37]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP42:%.*]] = select <4 x i1> [[TMP22]], <4 x i1> [[TMP38]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP23]], <4 x i1> [[TMP39]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP44]], i32 0 +; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP44]], i32 4 +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP44]], i32 8 +; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP44]], i32 12 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP40]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP41]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP42]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP43]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX2: vec.epilog.iter.check: +; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX2: vec.epilog.ph: +; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX2-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; AVX2-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] +; AVX2-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AVX2: vec.epilog.vector.body: +; AVX2-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX2-NEXT: [[TMP53:%.*]] = add i64 [[INDEX10]], 0 +; AVX2-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP53]] +; AVX2-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP55]], align 1 +; AVX2-NEXT: [[TMP56:%.*]] = and <4 x i8> [[WIDE_LOAD11]], +; AVX2-NEXT: [[TMP57:%.*]] = icmp eq <4 x i8> [[TMP56]], zeroinitializer +; AVX2-NEXT: [[TMP58:%.*]] = xor <4 x i1> [[TMP57]], +; AVX2-NEXT: [[TMP59:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP53]] +; AVX2-NEXT: [[TMP60:%.*]] = getelementptr ptr, ptr [[TMP59]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP60]], i32 8, <4 x i1> [[TMP58]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP61:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX2-NEXT: [[TMP62:%.*]] = xor <4 x i1> [[TMP61]], +; AVX2-NEXT: [[TMP63:%.*]] = select <4 x i1> [[TMP58]], <4 x i1> [[TMP62]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP64:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP53]] +; AVX2-NEXT: [[TMP65:%.*]] = getelementptr double, ptr [[TMP64]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP65]], i32 8, <4 x i1> [[TMP63]]) +; AVX2-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 +; AVX2-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] +; AVX2-NEXT: br i1 [[TMP66]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; AVX2: vec.epilog.middle.block: +; AVX2-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; AVX2-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX2: vec.epilog.scalar.ph: +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: -; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; AVX2-NEXT: [[TMP54:%.*]] = and i8 [[TMP53]], 1 -; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0 +; AVX2-NEXT: [[TMP67:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AVX2-NEXT: [[TMP68:%.*]] = and i8 [[TMP67]], 1 +; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP68]], 0 ; AVX2-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX2: land.lhs.true: ; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -; AVX2-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null +; AVX2-NEXT: [[TMP69:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 +; AVX2-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP69]], null ; AVX2-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] @@ -1841,7 +2119,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: @@ -1850,11 +2128,14 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-LABEL: @foo7( ; AVX512-NEXT: entry: ; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 -; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX512: for.body.preheader: +; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[ITER_CHECK:%.*]] +; AVX512: iter.check: ; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32 -; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX512: vector.main.loop.iter.check: +; AVX512-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32 ; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] @@ -1874,77 +2155,110 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 24 ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1 -; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1 -; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1 -; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1 +; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1 +; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1 +; AVX512-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1 ; AVX512-NEXT: [[TMP12:%.*]] = and <8 x i8> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP13:%.*]] = and <8 x i8> [[WIDE_LOAD1]], -; AVX512-NEXT: [[TMP14:%.*]] = and <8 x i8> [[WIDE_LOAD2]], -; AVX512-NEXT: [[TMP15:%.*]] = and <8 x i8> [[WIDE_LOAD3]], +; AVX512-NEXT: [[TMP13:%.*]] = and <8 x i8> [[WIDE_LOAD2]], +; AVX512-NEXT: [[TMP14:%.*]] = and <8 x i8> [[WIDE_LOAD3]], +; AVX512-NEXT: [[TMP15:%.*]] = and <8 x i8> [[WIDE_LOAD4]], ; AVX512-NEXT: [[TMP16:%.*]] = icmp eq <8 x i8> [[TMP12]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = icmp eq <8 x i8> [[TMP13]], zeroinitializer ; AVX512-NEXT: [[TMP18:%.*]] = icmp eq <8 x i8> [[TMP14]], zeroinitializer ; AVX512-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[TMP15]], zeroinitializer -; AVX512-NEXT: [[TMP24:%.*]] = xor <8 x i1> [[TMP16]], -; AVX512-NEXT: [[TMP25:%.*]] = xor <8 x i1> [[TMP17]], -; AVX512-NEXT: [[TMP26:%.*]] = xor <8 x i1> [[TMP18]], -; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP19]], -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 16 -; AVX512-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 24 -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP24]], <8 x ptr> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP25]], <8 x ptr> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP26]], <8 x ptr> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP27]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP20:%.*]] = xor <8 x i1> [[TMP16]], +; AVX512-NEXT: [[TMP21:%.*]] = xor <8 x i1> [[TMP17]], +; AVX512-NEXT: [[TMP22:%.*]] = xor <8 x i1> [[TMP18]], +; AVX512-NEXT: [[TMP23:%.*]] = xor <8 x i1> [[TMP19]], +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 0 +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 8 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 16 +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP20]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP21]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP22]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP23]], <8 x ptr> poison) ; AVX512-NEXT: [[TMP32:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP33:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX512-NEXT: [[TMP34:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX512-NEXT: [[TMP35:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX512-NEXT: [[TMP40:%.*]] = xor <8 x i1> [[TMP32]], -; AVX512-NEXT: [[TMP41:%.*]] = xor <8 x i1> [[TMP33]], -; AVX512-NEXT: [[TMP42:%.*]] = xor <8 x i1> [[TMP34]], -; AVX512-NEXT: [[TMP43:%.*]] = xor <8 x i1> [[TMP35]], -; AVX512-NEXT: [[TMP44:%.*]] = select <8 x i1> [[TMP24]], <8 x i1> [[TMP40]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP45:%.*]] = select <8 x i1> [[TMP25]], <8 x i1> [[TMP41]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP46:%.*]] = select <8 x i1> [[TMP26]], <8 x i1> [[TMP42]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = select <8 x i1> [[TMP27]], <8 x i1> [[TMP43]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 16 -; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 24 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP48]], i32 8, <8 x i1> [[TMP44]]) -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP49]], i32 8, <8 x i1> [[TMP45]]) -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP50]], i32 8, <8 x i1> [[TMP46]]) -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP51]], i32 8, <8 x i1> [[TMP47]]) +; AVX512-NEXT: [[TMP33:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX512-NEXT: [[TMP34:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX512-NEXT: [[TMP35:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD7]], zeroinitializer +; AVX512-NEXT: [[TMP36:%.*]] = xor <8 x i1> [[TMP32]], +; AVX512-NEXT: [[TMP37:%.*]] = xor <8 x i1> [[TMP33]], +; AVX512-NEXT: [[TMP38:%.*]] = xor <8 x i1> [[TMP34]], +; AVX512-NEXT: [[TMP39:%.*]] = xor <8 x i1> [[TMP35]], +; AVX512-NEXT: [[TMP40:%.*]] = select <8 x i1> [[TMP20]], <8 x i1> [[TMP36]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP41:%.*]] = select <8 x i1> [[TMP21]], <8 x i1> [[TMP37]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = select <8 x i1> [[TMP22]], <8 x i1> [[TMP38]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP43:%.*]] = select <8 x i1> [[TMP23]], <8 x i1> [[TMP39]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP44]], i32 0 +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP44]], i32 8 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP44]], i32 16 +; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP44]], i32 24 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP48]], i32 8, <8 x i1> [[TMP40]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP49]], i32 8, <8 x i1> [[TMP41]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP50]], i32 8, <8 x i1> [[TMP42]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP51]], i32 8, <8 x i1> [[TMP43]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX512: vec.epilog.iter.check: +; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX512: vec.epilog.ph: +; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX512-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; AVX512-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] +; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AVX512: vec.epilog.vector.body: +; AVX512-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP53:%.*]] = add i64 [[INDEX10]], 0 +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP53]] +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP55]], align 1 +; AVX512-NEXT: [[TMP56:%.*]] = and <8 x i8> [[WIDE_LOAD11]], +; AVX512-NEXT: [[TMP57:%.*]] = icmp eq <8 x i8> [[TMP56]], zeroinitializer +; AVX512-NEXT: [[TMP58:%.*]] = xor <8 x i1> [[TMP57]], +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP53]] +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr ptr, ptr [[TMP59]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP60]], i32 8, <8 x i1> [[TMP58]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP61:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX512-NEXT: [[TMP62:%.*]] = xor <8 x i1> [[TMP61]], +; AVX512-NEXT: [[TMP63:%.*]] = select <8 x i1> [[TMP58]], <8 x i1> [[TMP62]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP64:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP53]] +; AVX512-NEXT: [[TMP65:%.*]] = getelementptr double, ptr [[TMP64]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP65]], i32 8, <8 x i1> [[TMP63]]) +; AVX512-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 8 +; AVX512-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] +; AVX512-NEXT: br i1 [[TMP66]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]] +; AVX512: vec.epilog.middle.block: +; AVX512-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; AVX512-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX512: vec.epilog.scalar.ph: +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: -; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; AVX512-NEXT: [[TMP54:%.*]] = and i8 [[TMP53]], 1 -; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0 +; AVX512-NEXT: [[TMP67:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP68:%.*]] = and i8 [[TMP67]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP68]], 0 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX512: land.lhs.true: ; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -; AVX512-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null +; AVX512-NEXT: [[TMP69:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP69]], null ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] @@ -1953,7 +2267,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: @@ -2007,11 +2321,14 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-LABEL: @foo8( ; AVX1-NEXT: entry: ; AVX1-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 -; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX1: for.body.preheader: +; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[ITER_CHECK:%.*]] +; AVX1: iter.check: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 -; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX1: vector.main.loop.iter.check: +; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: ; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] @@ -2031,77 +2348,110 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8 ; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12 ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 -; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 -; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 -; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; AVX1-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 ; AVX1-NEXT: [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], -; AVX1-NEXT: [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD2]], -; AVX1-NEXT: [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD3]], +; AVX1-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD2]], +; AVX1-NEXT: [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD3]], +; AVX1-NEXT: [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD4]], ; AVX1-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[TMP12]], zeroinitializer ; AVX1-NEXT: [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer ; AVX1-NEXT: [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer ; AVX1-NEXT: [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer -; AVX1-NEXT: [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], -; AVX1-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], -; AVX1-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], -; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], -; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 -; AVX1-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4 -; AVX1-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 -; AVX1-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12 -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison) -; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison) -; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison) -; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP16]], +; AVX1-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP17]], +; AVX1-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP18]], +; AVX1-NEXT: [[TMP23:%.*]] = xor <4 x i1> [[TMP19]], +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP27:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 0 +; AVX1-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 4 +; AVX1-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 8 +; AVX1-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 12 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP20]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP21]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP22]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP23]], <4 x ptr> poison) ; AVX1-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX1-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX1-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX1-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], -; AVX1-NEXT: [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], -; AVX1-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], -; AVX1-NEXT: [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], -; AVX1-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 -; AVX1-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4 -; AVX1-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 -; AVX1-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]]) -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]]) -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]]) -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]]) +; AVX1-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX1-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX1-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD7]], zeroinitializer +; AVX1-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[TMP32]], +; AVX1-NEXT: [[TMP37:%.*]] = xor <4 x i1> [[TMP33]], +; AVX1-NEXT: [[TMP38:%.*]] = xor <4 x i1> [[TMP34]], +; AVX1-NEXT: [[TMP39:%.*]] = xor <4 x i1> [[TMP35]], +; AVX1-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP20]], <4 x i1> [[TMP36]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP21]], <4 x i1> [[TMP37]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP42:%.*]] = select <4 x i1> [[TMP22]], <4 x i1> [[TMP38]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP23]], <4 x i1> [[TMP39]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP44]], i32 0 +; AVX1-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP44]], i32 4 +; AVX1-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP44]], i32 8 +; AVX1-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP44]], i32 12 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP40]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP41]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP42]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP43]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX1-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX1: vec.epilog.iter.check: +; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX1: vec.epilog.ph: +; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX1-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] +; AVX1-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AVX1: vec.epilog.vector.body: +; AVX1-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX1-NEXT: [[TMP53:%.*]] = add i64 [[INDEX10]], 0 +; AVX1-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP53]] +; AVX1-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0 +; AVX1-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP55]], align 1 +; AVX1-NEXT: [[TMP56:%.*]] = and <4 x i8> [[WIDE_LOAD11]], +; AVX1-NEXT: [[TMP57:%.*]] = icmp eq <4 x i8> [[TMP56]], zeroinitializer +; AVX1-NEXT: [[TMP58:%.*]] = xor <4 x i1> [[TMP57]], +; AVX1-NEXT: [[TMP59:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP53]] +; AVX1-NEXT: [[TMP60:%.*]] = getelementptr ptr, ptr [[TMP59]], i32 0 +; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP60]], i32 8, <4 x i1> [[TMP58]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP61:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX1-NEXT: [[TMP62:%.*]] = xor <4 x i1> [[TMP61]], +; AVX1-NEXT: [[TMP63:%.*]] = select <4 x i1> [[TMP58]], <4 x i1> [[TMP62]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP64:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP53]] +; AVX1-NEXT: [[TMP65:%.*]] = getelementptr double, ptr [[TMP64]], i32 0 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP65]], i32 8, <4 x i1> [[TMP63]]) +; AVX1-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 +; AVX1-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] +; AVX1-NEXT: br i1 [[TMP66]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; AVX1: vec.epilog.middle.block: +; AVX1-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; AVX1-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX1: vec.epilog.scalar.ph: +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; AVX1-NEXT: [[TMP54:%.*]] = and i8 [[TMP53]], 1 -; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0 +; AVX1-NEXT: [[TMP67:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AVX1-NEXT: [[TMP68:%.*]] = and i8 [[TMP67]], 1 +; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP68]], 0 ; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX1: land.lhs.true: ; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -; AVX1-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null +; AVX1-NEXT: [[TMP69:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 +; AVX1-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP69]], null ; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] @@ -2110,7 +2460,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -2119,11 +2469,14 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-LABEL: @foo8( ; AVX2-NEXT: entry: ; AVX2-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 -; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX2: for.body.preheader: +; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[ITER_CHECK:%.*]] +; AVX2: iter.check: ; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 -; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX2: vector.main.loop.iter.check: +; AVX2-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; AVX2-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] @@ -2143,77 +2496,110 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12 ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 -; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 -; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 -; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; AVX2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 ; AVX2-NEXT: [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], -; AVX2-NEXT: [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD2]], -; AVX2-NEXT: [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD3]], +; AVX2-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD2]], +; AVX2-NEXT: [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD3]], +; AVX2-NEXT: [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD4]], ; AVX2-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[TMP12]], zeroinitializer ; AVX2-NEXT: [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer ; AVX2-NEXT: [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer ; AVX2-NEXT: [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer -; AVX2-NEXT: [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], -; AVX2-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], -; AVX2-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], -; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 -; AVX2-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4 -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 -; AVX2-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12 -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP16]], +; AVX2-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP17]], +; AVX2-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP18]], +; AVX2-NEXT: [[TMP23:%.*]] = xor <4 x i1> [[TMP19]], +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 0 +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 4 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 8 +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 12 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP20]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP21]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP22]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP23]], <4 x ptr> poison) ; AVX2-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX2-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX2-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX2-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX2-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], -; AVX2-NEXT: [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], -; AVX2-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], -; AVX2-NEXT: [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], -; AVX2-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 -; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4 -; AVX2-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 -; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]]) -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]]) -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]]) -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]]) +; AVX2-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX2-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX2-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD7]], zeroinitializer +; AVX2-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[TMP32]], +; AVX2-NEXT: [[TMP37:%.*]] = xor <4 x i1> [[TMP33]], +; AVX2-NEXT: [[TMP38:%.*]] = xor <4 x i1> [[TMP34]], +; AVX2-NEXT: [[TMP39:%.*]] = xor <4 x i1> [[TMP35]], +; AVX2-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP20]], <4 x i1> [[TMP36]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP21]], <4 x i1> [[TMP37]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP42:%.*]] = select <4 x i1> [[TMP22]], <4 x i1> [[TMP38]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP23]], <4 x i1> [[TMP39]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP44]], i32 0 +; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP44]], i32 4 +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP44]], i32 8 +; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP44]], i32 12 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP40]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP41]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP42]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP43]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX2: vec.epilog.iter.check: +; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX2: vec.epilog.ph: +; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX2-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; AVX2-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] +; AVX2-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AVX2: vec.epilog.vector.body: +; AVX2-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX2-NEXT: [[TMP53:%.*]] = add i64 [[INDEX10]], 0 +; AVX2-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP53]] +; AVX2-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP55]], align 1 +; AVX2-NEXT: [[TMP56:%.*]] = and <4 x i8> [[WIDE_LOAD11]], +; AVX2-NEXT: [[TMP57:%.*]] = icmp eq <4 x i8> [[TMP56]], zeroinitializer +; AVX2-NEXT: [[TMP58:%.*]] = xor <4 x i1> [[TMP57]], +; AVX2-NEXT: [[TMP59:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP53]] +; AVX2-NEXT: [[TMP60:%.*]] = getelementptr ptr, ptr [[TMP59]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP60]], i32 8, <4 x i1> [[TMP58]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP61:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX2-NEXT: [[TMP62:%.*]] = xor <4 x i1> [[TMP61]], +; AVX2-NEXT: [[TMP63:%.*]] = select <4 x i1> [[TMP58]], <4 x i1> [[TMP62]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP64:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP53]] +; AVX2-NEXT: [[TMP65:%.*]] = getelementptr double, ptr [[TMP64]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP65]], i32 8, <4 x i1> [[TMP63]]) +; AVX2-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 +; AVX2-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] +; AVX2-NEXT: br i1 [[TMP66]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; AVX2: vec.epilog.middle.block: +; AVX2-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; AVX2-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX2: vec.epilog.scalar.ph: +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: -; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; AVX2-NEXT: [[TMP54:%.*]] = and i8 [[TMP53]], 1 -; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0 +; AVX2-NEXT: [[TMP67:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AVX2-NEXT: [[TMP68:%.*]] = and i8 [[TMP67]], 1 +; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP68]], 0 ; AVX2-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX2: land.lhs.true: ; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -; AVX2-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null +; AVX2-NEXT: [[TMP69:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 +; AVX2-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP69]], null ; AVX2-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] @@ -2222,7 +2608,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: @@ -2231,11 +2617,14 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-LABEL: @foo8( ; AVX512-NEXT: entry: ; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 -; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX512: for.body.preheader: +; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[ITER_CHECK:%.*]] +; AVX512: iter.check: ; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32 -; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX512: vector.main.loop.iter.check: +; AVX512-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32 ; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] @@ -2255,77 +2644,110 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 24 ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1 -; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1 -; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1 -; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1 +; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1 +; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1 +; AVX512-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1 ; AVX512-NEXT: [[TMP12:%.*]] = and <8 x i8> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP13:%.*]] = and <8 x i8> [[WIDE_LOAD1]], -; AVX512-NEXT: [[TMP14:%.*]] = and <8 x i8> [[WIDE_LOAD2]], -; AVX512-NEXT: [[TMP15:%.*]] = and <8 x i8> [[WIDE_LOAD3]], +; AVX512-NEXT: [[TMP13:%.*]] = and <8 x i8> [[WIDE_LOAD2]], +; AVX512-NEXT: [[TMP14:%.*]] = and <8 x i8> [[WIDE_LOAD3]], +; AVX512-NEXT: [[TMP15:%.*]] = and <8 x i8> [[WIDE_LOAD4]], ; AVX512-NEXT: [[TMP16:%.*]] = icmp eq <8 x i8> [[TMP12]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = icmp eq <8 x i8> [[TMP13]], zeroinitializer ; AVX512-NEXT: [[TMP18:%.*]] = icmp eq <8 x i8> [[TMP14]], zeroinitializer ; AVX512-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[TMP15]], zeroinitializer -; AVX512-NEXT: [[TMP24:%.*]] = xor <8 x i1> [[TMP16]], -; AVX512-NEXT: [[TMP25:%.*]] = xor <8 x i1> [[TMP17]], -; AVX512-NEXT: [[TMP26:%.*]] = xor <8 x i1> [[TMP18]], -; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP19]], -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 16 -; AVX512-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 24 -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP24]], <8 x ptr> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP25]], <8 x ptr> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP26]], <8 x ptr> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP27]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP20:%.*]] = xor <8 x i1> [[TMP16]], +; AVX512-NEXT: [[TMP21:%.*]] = xor <8 x i1> [[TMP17]], +; AVX512-NEXT: [[TMP22:%.*]] = xor <8 x i1> [[TMP18]], +; AVX512-NEXT: [[TMP23:%.*]] = xor <8 x i1> [[TMP19]], +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 0 +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 8 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 16 +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP24]], i32 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP20]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP21]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP22]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP23]], <8 x ptr> poison) ; AVX512-NEXT: [[TMP32:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP33:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX512-NEXT: [[TMP34:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX512-NEXT: [[TMP35:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX512-NEXT: [[TMP40:%.*]] = xor <8 x i1> [[TMP32]], -; AVX512-NEXT: [[TMP41:%.*]] = xor <8 x i1> [[TMP33]], -; AVX512-NEXT: [[TMP42:%.*]] = xor <8 x i1> [[TMP34]], -; AVX512-NEXT: [[TMP43:%.*]] = xor <8 x i1> [[TMP35]], -; AVX512-NEXT: [[TMP44:%.*]] = select <8 x i1> [[TMP24]], <8 x i1> [[TMP40]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP45:%.*]] = select <8 x i1> [[TMP25]], <8 x i1> [[TMP41]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP46:%.*]] = select <8 x i1> [[TMP26]], <8 x i1> [[TMP42]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = select <8 x i1> [[TMP27]], <8 x i1> [[TMP43]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 16 -; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 24 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP48]], i32 8, <8 x i1> [[TMP44]]) -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP49]], i32 8, <8 x i1> [[TMP45]]) -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP50]], i32 8, <8 x i1> [[TMP46]]) -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP51]], i32 8, <8 x i1> [[TMP47]]) +; AVX512-NEXT: [[TMP33:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX512-NEXT: [[TMP34:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX512-NEXT: [[TMP35:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD7]], zeroinitializer +; AVX512-NEXT: [[TMP36:%.*]] = xor <8 x i1> [[TMP32]], +; AVX512-NEXT: [[TMP37:%.*]] = xor <8 x i1> [[TMP33]], +; AVX512-NEXT: [[TMP38:%.*]] = xor <8 x i1> [[TMP34]], +; AVX512-NEXT: [[TMP39:%.*]] = xor <8 x i1> [[TMP35]], +; AVX512-NEXT: [[TMP40:%.*]] = select <8 x i1> [[TMP20]], <8 x i1> [[TMP36]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP41:%.*]] = select <8 x i1> [[TMP21]], <8 x i1> [[TMP37]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = select <8 x i1> [[TMP22]], <8 x i1> [[TMP38]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP43:%.*]] = select <8 x i1> [[TMP23]], <8 x i1> [[TMP39]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP44]], i32 0 +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP44]], i32 8 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP44]], i32 16 +; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP44]], i32 24 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP48]], i32 8, <8 x i1> [[TMP40]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP49]], i32 8, <8 x i1> [[TMP41]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP50]], i32 8, <8 x i1> [[TMP42]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP51]], i32 8, <8 x i1> [[TMP43]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP53:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX512: vec.epilog.iter.check: +; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX512: vec.epilog.ph: +; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX512-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; AVX512-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] +; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AVX512: vec.epilog.vector.body: +; AVX512-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP53:%.*]] = add i64 [[INDEX10]], 0 +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP53]] +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP55]], align 1 +; AVX512-NEXT: [[TMP56:%.*]] = and <8 x i8> [[WIDE_LOAD11]], +; AVX512-NEXT: [[TMP57:%.*]] = icmp eq <8 x i8> [[TMP56]], zeroinitializer +; AVX512-NEXT: [[TMP58:%.*]] = xor <8 x i1> [[TMP57]], +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP53]] +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr ptr, ptr [[TMP59]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP60]], i32 8, <8 x i1> [[TMP58]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP61:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX512-NEXT: [[TMP62:%.*]] = xor <8 x i1> [[TMP61]], +; AVX512-NEXT: [[TMP63:%.*]] = select <8 x i1> [[TMP58]], <8 x i1> [[TMP62]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP64:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP53]] +; AVX512-NEXT: [[TMP65:%.*]] = getelementptr double, ptr [[TMP64]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP65]], i32 8, <8 x i1> [[TMP63]]) +; AVX512-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 8 +; AVX512-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] +; AVX512-NEXT: br i1 [[TMP66]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] +; AVX512: vec.epilog.middle.block: +; AVX512-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; AVX512-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX512: vec.epilog.scalar.ph: +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: -; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; AVX512-NEXT: [[TMP54:%.*]] = and i8 [[TMP53]], 1 -; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0 +; AVX512-NEXT: [[TMP67:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP68:%.*]] = and i8 [[TMP67]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP68]], 0 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX512: land.lhs.true: ; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -; AVX512-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null +; AVX512-NEXT: [[TMP69:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP69]], null ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] @@ -2334,7 +2756,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP55:![0-9]+]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll index 8800fa26f067c..45d931864a85e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll @@ -8,12 +8,13 @@ target triple = "x86_64-unknown-linux-gnu" define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr addrspace(1) align 8 dereferenceable_or_null(8), i64) #0 { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[PREHEADER:%.*]] -; CHECK: preheader: +; CHECK-NEXT: br label [[ITER_CHECK:%.*]] +; CHECK: iter.check: ; CHECK-NEXT: [[DOT10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 16 ; CHECK-NEXT: [[DOT12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 16 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2:%.*]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: [[UMAX2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2:%.*]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 16 @@ -22,9 +23,12 @@ define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr addrs ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr addrspace(1) [[DOT10]], [[SCEVGEP1]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr addrspace(1) [[DOT12]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -16 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[UMAX2]], -16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -33,35 +37,55 @@ define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr addrs ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP5]], i64 64 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP5]], i64 96 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP5]], align 8, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP6]], align 8, !alias.scope [[META0]] -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP7]], align 8, !alias.scope [[META0]] -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP8]], align 8, !alias.scope [[META0]] +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP6]], align 8, !alias.scope [[META0]] +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP7]], align 8, !alias.scope [[META0]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP8]], align 8, !alias.scope [[META0]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT10]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP9]], i64 32 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP9]], i64 64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP9]], i64 96 ; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD]], ptr addrspace(1) [[TMP9]], align 8, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD3]], ptr addrspace(1) [[TMP10]], align 8, !alias.scope [[META3]], !noalias [[META0]] -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD4]], ptr addrspace(1) [[TMP11]], align 8, !alias.scope [[META3]], !noalias [[META0]] -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD5]], ptr addrspace(1) [[TMP12]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD4]], ptr addrspace(1) [[TMP10]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD5]], ptr addrspace(1) [[TMP11]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD6]], ptr addrspace(1) [[TMP12]], align 8, !alias.scope [[META3]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP2]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[UMAX2]], 12 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_VEC8:%.*]] = and i64 [[UMAX2]], -4 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT12]], i64 [[INDEX9]] +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP14]], align 8, !alias.scope [[META8:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT10]], i64 [[INDEX9]] +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD10]], ptr addrspace(1) [[TMP15]], align 8, !alias.scope [[META11:![0-9]+]], !noalias [[META8]] +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX9]], 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[TMP16]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N_VEC8]], [[TMP2]] +; CHECK-NEXT: br i1 [[CMP_N12]], label [[LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[DOT18:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT12]], i64 [[INDVARS_IV3]] ; CHECK-NEXT: [[V:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[DOT18]], align 8 ; CHECK-NEXT: [[DOT20:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT10]], i64 [[INDVARS_IV3]] ; CHECK-NEXT: store ptr addrspace(1) [[V]], ptr addrspace(1) [[DOT20]], align 8 ; CHECK-NEXT: [[INDVARS_IV_NEXT4]] = add nuw nsw i64 [[INDVARS_IV3]], 1 ; CHECK-NEXT: [[DOT21:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT4]], [[TMP2]] -; CHECK-NEXT: br i1 [[DOT21]], label [[LOOP]], label [[LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[DOT21]], label [[LOOP]], label [[LOOPEXIT]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: loopexit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll index 7cbf0ab025206..88ee42ba7cb6a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll @@ -179,11 +179,14 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-LABEL: @test_muladd( ; AVX1-NEXT: entry: ; AVX1-NEXT: [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; AVX1-NEXT: br i1 [[CMP30]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; AVX1: for.body.preheader: +; AVX1-NEXT: br i1 [[CMP30]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] +; AVX1: iter.check: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 -; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 -; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX1: vector.main.loop.iter.check: +; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: ; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] @@ -207,21 +210,21 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 ; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 ; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2 -; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 -; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP14]], align 2 -; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP15]], align 2 +; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 +; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP14]], align 2 +; AVX1-NEXT: [[WIDE_VEC4:%.*]] = load <8 x i16>, ptr [[TMP15]], align 2 ; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC4]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <8 x i16> [[WIDE_VEC4]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[TMP16:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> -; AVX1-NEXT: [[TMP17:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> -; AVX1-NEXT: [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32> -; AVX1-NEXT: [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32> +; AVX1-NEXT: [[TMP17:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32> +; AVX1-NEXT: [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32> +; AVX1-NEXT: [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32> ; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP4]] ; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP5]] ; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]] @@ -230,34 +233,34 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP21]], i32 0 ; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0 ; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[TMP23]], i32 0 -; AVX1-NEXT: [[WIDE_VEC11:%.*]] = load <8 x i16>, ptr [[TMP24]], align 2 -; AVX1-NEXT: [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP25]], align 2 -; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP26]], align 2 -; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP27]], align 2 -; AVX1-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP24]], align 2 +; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP25]], align 2 +; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP26]], align 2 +; AVX1-NEXT: [[WIDE_VEC15:%.*]] = load <8 x i16>, ptr [[TMP27]], align 2 ; AVX1-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC15]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC15]] to <4 x i32> -; AVX1-NEXT: [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to <4 x i32> -; AVX1-NEXT: [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32> -; AVX1-NEXT: [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC23:%.*]] = shufflevector <8 x i16> [[WIDE_VEC15]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to <4 x i32> +; AVX1-NEXT: [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32> +; AVX1-NEXT: [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32> +; AVX1-NEXT: [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32> ; AVX1-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP16]] ; AVX1-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP29]], [[TMP17]] ; AVX1-NEXT: [[TMP34:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP18]] ; AVX1-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP19]] -; AVX1-NEXT: [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32> -; AVX1-NEXT: [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32> -; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32> -; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32> -; AVX1-NEXT: [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32> -; AVX1-NEXT: [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32> -; AVX1-NEXT: [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32> -; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32> +; AVX1-NEXT: [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32> +; AVX1-NEXT: [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32> +; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32> +; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC11]] to <4 x i32> +; AVX1-NEXT: [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32> +; AVX1-NEXT: [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32> +; AVX1-NEXT: [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32> +; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC23]] to <4 x i32> ; AVX1-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]] ; AVX1-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]] ; AVX1-NEXT: [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP38]] @@ -283,34 +286,73 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX1: vec.epilog.iter.check: +; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX1: vec.epilog.ph: +; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX1-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: [[N_VEC25:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF24]] +; AVX1-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AVX1: vec.epilog.vector.body: +; AVX1-NEXT: [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT33:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX1-NEXT: [[TMP61:%.*]] = add i64 [[INDEX26]], 0 +; AVX1-NEXT: [[TMP62:%.*]] = shl nuw nsw i64 [[TMP61]], 1 +; AVX1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP62]] +; AVX1-NEXT: [[TMP64:%.*]] = getelementptr inbounds i16, ptr [[TMP63]], i32 0 +; AVX1-NEXT: [[WIDE_VEC27:%.*]] = load <8 x i16>, ptr [[TMP64]], align 2 +; AVX1-NEXT: [[STRIDED_VEC28:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC29:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP65:%.*]] = sext <4 x i16> [[STRIDED_VEC28]] to <4 x i32> +; AVX1-NEXT: [[TMP66:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP62]] +; AVX1-NEXT: [[TMP67:%.*]] = getelementptr inbounds i16, ptr [[TMP66]], i32 0 +; AVX1-NEXT: [[WIDE_VEC30:%.*]] = load <8 x i16>, ptr [[TMP67]], align 2 +; AVX1-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP68:%.*]] = sext <4 x i16> [[STRIDED_VEC31]] to <4 x i32> +; AVX1-NEXT: [[TMP69:%.*]] = mul nsw <4 x i32> [[TMP68]], [[TMP65]] +; AVX1-NEXT: [[TMP70:%.*]] = sext <4 x i16> [[STRIDED_VEC29]] to <4 x i32> +; AVX1-NEXT: [[TMP71:%.*]] = sext <4 x i16> [[STRIDED_VEC32]] to <4 x i32> +; AVX1-NEXT: [[TMP72:%.*]] = mul nsw <4 x i32> [[TMP71]], [[TMP70]] +; AVX1-NEXT: [[TMP73:%.*]] = add nsw <4 x i32> [[TMP72]], [[TMP69]] +; AVX1-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP61]] +; AVX1-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[TMP74]], i32 0 +; AVX1-NEXT: store <4 x i32> [[TMP73]], ptr [[TMP75]], align 4 +; AVX1-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX26]], 4 +; AVX1-NEXT: [[TMP76:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC25]] +; AVX1-NEXT: br i1 [[TMP76]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX1: vec.epilog.middle.block: +; AVX1-NEXT: [[CMP_N34:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC25]] +; AVX1-NEXT: br i1 [[CMP_N34]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX1: vec.epilog.scalar.ph: +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; AVX1-NEXT: [[TMP61:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP61]] -; AVX1-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; AVX1-NEXT: [[CONV:%.*]] = sext i16 [[TMP62]] to i32 -; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP61]] -; AVX1-NEXT: [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 -; AVX1-NEXT: [[CONV5:%.*]] = sext i16 [[TMP63]] to i32 +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; AVX1-NEXT: [[TMP77:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP77]] +; AVX1-NEXT: [[TMP78:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; AVX1-NEXT: [[CONV:%.*]] = sext i16 [[TMP78]] to i32 +; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP77]] +; AVX1-NEXT: [[TMP79:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; AVX1-NEXT: [[CONV5:%.*]] = sext i16 [[TMP79]] to i32 ; AVX1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]] -; AVX1-NEXT: [[TMP64:%.*]] = or disjoint i64 [[TMP61]], 1 -; AVX1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP64]] -; AVX1-NEXT: [[TMP65:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 -; AVX1-NEXT: [[CONV11:%.*]] = sext i16 [[TMP65]] to i32 -; AVX1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP64]] -; AVX1-NEXT: [[TMP66:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 -; AVX1-NEXT: [[CONV16:%.*]] = sext i16 [[TMP66]] to i32 +; AVX1-NEXT: [[TMP80:%.*]] = or disjoint i64 [[TMP77]], 1 +; AVX1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP80]] +; AVX1-NEXT: [[TMP81:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 +; AVX1-NEXT: [[CONV11:%.*]] = sext i16 [[TMP81]] to i32 +; AVX1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP80]] +; AVX1-NEXT: [[TMP82:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 +; AVX1-NEXT: [[CONV16:%.*]] = sext i16 [[TMP82]] to i32 ; AVX1-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]] ; AVX1-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]] ; AVX1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]] ; AVX1-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll index 743ca20f92b49..324c9e9033993 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu" define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) local_unnamed_addr #0 { ; CHECK-LABEL: @japi1_vect_42283( -; CHECK-NEXT: top: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1:%.*]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = load atomic ptr, ptr @jlplt_ijl_alloc_array_1d_10294_got unordered, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = tail call ptr addrspace(10) [[TMP3]](ptr addrspace(10) null, i64 0) @@ -19,8 +19,8 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) [[TMP5]], i64 0, i32 1 ; CHECK-NEXT: [[DOTUNPACK2:%.*]] = load i64, ptr addrspace(10) [[DOTELT1]], align 8, !tbaa [[TBAA8]] ; CHECK-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP2]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 28 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]]) ; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0 @@ -38,56 +38,90 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[TMP15:%.*]] = icmp ult ptr addrspace(13) [[TMP14]], [[SCEVGEP]] ; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP15]], [[MUL_OVERFLOW3]] ; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]] -; CHECK-NEXT: br i1 [[TMP17]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK4:%.*]] = icmp ult i64 [[TMP8]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK4]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP8]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[STEP_ADD4:%.*]] = add <4 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[STEP_ADD5:%.*]] = add <4 x i64> [[STEP_ADD4]], +; CHECK-NEXT: [[STEP_ADD5:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD6:%.*]] = add <4 x i64> [[STEP_ADD5]], ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND]], i32 0 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD4]], i32 0 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD6]], i32 0 ; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP18]], i32 8, <4 x i1> ), !tbaa [[TBAA10:![0-9]+]] ; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP19]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] ; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP20]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] ; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP21]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND]], i32 1 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD4]], i32 1 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP22]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP23]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP24]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP25]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD6]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT9]], <4 x ptr addrspace(13)> [[TMP22]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT9]], <4 x ptr addrspace(13)> [[TMP23]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT9]], <4 x ptr addrspace(13)> [[TMP24]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT9]], <4 x ptr addrspace(13)> [[TMP25]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD5]], +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD6]], ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[L44:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TOP:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[L44:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP8]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[TMP8]], 4 +; CHECK-NEXT: [[N_VEC11:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF10]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT17]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT19]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX13:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND14:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND14]], i32 0 +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT18]], <4 x ptr addrspace(13)> [[TMP27]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND14]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT20]], <4 x ptr addrspace(13)> [[TMP28]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: [[INDEX_NEXT21]] = add nuw i64 [[INDEX13]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i64> [[VEC_IND14]], +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT21]], [[N_VEC11]] +; CHECK-NEXT: br i1 [[TMP29]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N22:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC11]] +; CHECK-NEXT: br i1 [[CMP_N22]], label [[L44]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i64 [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[L26:%.*]] ; CHECK: L26: -; CHECK-NEXT: [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP27:%.*]], [[L26]] ] +; CHECK-NEXT: [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP30:%.*]], [[L26]] ] ; CHECK-NEXT: [[DOTREPACK:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], i64 [[VALUE_PHI5]], i32 0 ; CHECK-NEXT: store ptr addrspace(10) [[DOTUNPACK]], ptr addrspace(13) [[DOTREPACK]], align 8, !tbaa [[TBAA10]] ; CHECK-NEXT: [[DOTREPACK4:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], i64 [[VALUE_PHI5]], i32 1 ; CHECK-NEXT: store i64 [[DOTUNPACK2]], ptr addrspace(13) [[DOTREPACK4]], align 8, !tbaa [[TBAA10]] -; CHECK-NEXT: [[TMP27]] = add i64 [[VALUE_PHI5]], 1 +; CHECK-NEXT: [[TMP30]] = add i64 [[VALUE_PHI5]], 1 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[VALUE_PHI5]], [[TMP2]] -; CHECK-NEXT: br i1 [[DOTNOT]], label [[L44]], label [[L26]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[DOTNOT]], label [[L44]], label [[L26]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: L44: ; CHECK-NEXT: ret ptr addrspace(10) null ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll index 9e6fb277a9dd7..8a6818c45d650 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -11,10 +11,12 @@ target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: norecurse nounwind readonly uwtable define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 { ; CHECK-LABEL: @matrix_row_col( -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[J:%.*]] to i64 -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -183,33 +185,73 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <8 x i32> [[TMP146]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <8 x i32> [[TMP147]], [[BIN_RDX7]] ; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]]) -; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP150:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP150]], [[VEC_EPILOG_PH]] ], [ [[TMP171:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP151:%.*]] = add i64 [[INDEX9]], 0 +; CHECK-NEXT: [[TMP152:%.*]] = add i64 [[INDEX9]], 1 +; CHECK-NEXT: [[TMP153:%.*]] = add i64 [[INDEX9]], 2 +; CHECK-NEXT: [[TMP154:%.*]] = add i64 [[INDEX9]], 3 +; CHECK-NEXT: [[TMP155:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP151]] +; CHECK-NEXT: [[TMP156:%.*]] = getelementptr inbounds i32, ptr [[TMP155]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP156]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP157:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP151]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP158:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP152]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP159:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP153]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP160:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP154]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP161:%.*]] = load i32, ptr [[TMP157]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP162:%.*]] = load i32, ptr [[TMP158]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP163:%.*]] = load i32, ptr [[TMP159]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP164:%.*]] = load i32, ptr [[TMP160]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP165:%.*]] = insertelement <4 x i32> poison, i32 [[TMP161]], i32 0 +; CHECK-NEXT: [[TMP166:%.*]] = insertelement <4 x i32> [[TMP165]], i32 [[TMP162]], i32 1 +; CHECK-NEXT: [[TMP167:%.*]] = insertelement <4 x i32> [[TMP166]], i32 [[TMP163]], i32 2 +; CHECK-NEXT: [[TMP168:%.*]] = insertelement <4 x i32> [[TMP167]], i32 [[TMP164]], i32 3 +; CHECK-NEXT: [[TMP169:%.*]] = mul nsw <4 x i32> [[TMP168]], [[WIDE_LOAD11]] +; CHECK-NEXT: [[TMP170:%.*]] = add <4 x i32> [[VEC_PHI10]], +; CHECK-NEXT: [[TMP171]] = add <4 x i32> [[TMP170]], [[TMP169]] +; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 4 +; CHECK-NEXT: [[TMP172:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 100 +; CHECK-NEXT: br i1 [[TMP172]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP173:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP171]]) +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP173]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ [[TMP173]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ADD7_LCSSA]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP174:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP151:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]] -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP151]], [[TMP150]] +; CHECK-NEXT: [[TMP175:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP175]], [[TMP174]] ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUM_015]], 4 ; CHECK-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; ; MAX-BW-LABEL: @matrix_row_col( -; MAX-BW-NEXT: entry: +; MAX-BW-NEXT: iter.check: ; MAX-BW-NEXT: [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64 ; MAX-BW-NEXT: [[IDXPROM5:%.*]] = sext i32 [[J:%.*]] to i64 -; MAX-BW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; MAX-BW-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; MAX-BW: vector.main.loop.iter.check: +; MAX-BW-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; MAX-BW: vector.ph: ; MAX-BW-NEXT: br label [[VECTOR_BODY:%.*]] ; MAX-BW: vector.body: @@ -378,27 +420,65 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[BIN_RDX7:%.*]] = add <8 x i32> [[TMP146]], [[BIN_RDX]] ; MAX-BW-NEXT: [[BIN_RDX8:%.*]] = add <8 x i32> [[TMP147]], [[BIN_RDX7]] ; MAX-BW-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]]) -; MAX-BW-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; MAX-BW: scalar.ph: -; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-BW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; MAX-BW-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; MAX-BW: vec.epilog.iter.check: +; MAX-BW-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; MAX-BW: vec.epilog.ph: +; MAX-BW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; MAX-BW-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; MAX-BW-NEXT: [[TMP150:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 +; MAX-BW-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; MAX-BW: vec.epilog.vector.body: +; MAX-BW-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP150]], [[VEC_EPILOG_PH]] ], [ [[TMP171:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; MAX-BW-NEXT: [[TMP151:%.*]] = add i64 [[INDEX9]], 0 +; MAX-BW-NEXT: [[TMP152:%.*]] = add i64 [[INDEX9]], 1 +; MAX-BW-NEXT: [[TMP153:%.*]] = add i64 [[INDEX9]], 2 +; MAX-BW-NEXT: [[TMP154:%.*]] = add i64 [[INDEX9]], 3 +; MAX-BW-NEXT: [[TMP155:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP151]] +; MAX-BW-NEXT: [[TMP156:%.*]] = getelementptr inbounds i32, ptr [[TMP155]], i32 0 +; MAX-BW-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP156]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP157:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP151]], i64 [[IDXPROM5]] +; MAX-BW-NEXT: [[TMP158:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP152]], i64 [[IDXPROM5]] +; MAX-BW-NEXT: [[TMP159:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP153]], i64 [[IDXPROM5]] +; MAX-BW-NEXT: [[TMP160:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP154]], i64 [[IDXPROM5]] +; MAX-BW-NEXT: [[TMP161:%.*]] = load i32, ptr [[TMP157]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP162:%.*]] = load i32, ptr [[TMP158]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP163:%.*]] = load i32, ptr [[TMP159]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP164:%.*]] = load i32, ptr [[TMP160]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP165:%.*]] = insertelement <4 x i32> poison, i32 [[TMP161]], i32 0 +; MAX-BW-NEXT: [[TMP166:%.*]] = insertelement <4 x i32> [[TMP165]], i32 [[TMP162]], i32 1 +; MAX-BW-NEXT: [[TMP167:%.*]] = insertelement <4 x i32> [[TMP166]], i32 [[TMP163]], i32 2 +; MAX-BW-NEXT: [[TMP168:%.*]] = insertelement <4 x i32> [[TMP167]], i32 [[TMP164]], i32 3 +; MAX-BW-NEXT: [[TMP169:%.*]] = mul nsw <4 x i32> [[TMP168]], [[WIDE_LOAD11]] +; MAX-BW-NEXT: [[TMP170:%.*]] = add <4 x i32> [[VEC_PHI10]], +; MAX-BW-NEXT: [[TMP171]] = add <4 x i32> [[TMP170]], [[TMP169]] +; MAX-BW-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 4 +; MAX-BW-NEXT: [[TMP172:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 100 +; MAX-BW-NEXT: br i1 [[TMP172]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; MAX-BW: vec.epilog.middle.block: +; MAX-BW-NEXT: [[TMP173:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP171]]) +; MAX-BW-NEXT: br i1 true, label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_SCALAR_PH]] +; MAX-BW: vec.epilog.scalar.ph: +; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; MAX-BW-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP173]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ] ; MAX-BW-NEXT: br label [[FOR_BODY:%.*]] ; MAX-BW: for.cond.cleanup: -; MAX-BW-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ] +; MAX-BW-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ [[TMP173]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; MAX-BW-NEXT: ret i32 [[ADD7_LCSSA]] ; MAX-BW: for.body: -; MAX-BW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; MAX-BW-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ] +; MAX-BW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; MAX-BW-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ] ; MAX-BW-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]] -; MAX-BW-NEXT: [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP174:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]] -; MAX-BW-NEXT: [[TMP151:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]] -; MAX-BW-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP151]], [[TMP150]] +; MAX-BW-NEXT: [[TMP175:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP175]], [[TMP174]] ; MAX-BW-NEXT: [[ADD:%.*]] = add i32 [[SUM_015]], 4 ; MAX-BW-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]] ; MAX-BW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; MAX-BW-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; MAX-BW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; MAX-BW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; entry: %idxprom = sext i32 %i to i64 @@ -483,7 +563,7 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: store i8 [[TMP35]], ptr [[TMP27]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 -; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -503,7 +583,7 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; @@ -604,7 +684,7 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: store i8 [[TMP67]], ptr [[TMP51]], align 1 ; MAX-BW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; MAX-BW-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 -; MAX-BW-NEXT: br i1 [[TMP68]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; MAX-BW-NEXT: br i1 [[TMP68]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; MAX-BW: middle.block: ; MAX-BW-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; MAX-BW: scalar.ph: @@ -624,7 +704,7 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1 ; MAX-BW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2 ; MAX-BW-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024 -; MAX-BW-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP10:![0-9]+]] +; MAX-BW-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]] ; MAX-BW: for.cond.cleanup: ; MAX-BW-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll index daa35d31f2e0c..044b86c1384c0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -20,8 +20,10 @@ target triple = "x86_64-unknown-linux" ; define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { ; CHECK-LABEL: @vectorized( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -66,21 +68,42 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX8]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP27]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP29]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP30:%.*]] = fadd fast <4 x float> [[WIDE_LOAD9]], [[WIDE_LOAD10]] +; CHECK-NEXT: store <4 x float> [[TMP30]], ptr [[TMP29]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 4 +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 20 +; CHECK-NEXT: br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP32]], [[TMP33]] ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -126,15 +149,15 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP6:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7:![0-9]+]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -143,14 +166,14 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -191,15 +214,15 @@ define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -208,14 +231,14 @@ define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll index 70b002f766b75..966d7e3cded0a 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll @@ -41,7 +41,7 @@ define void @apply_delta(ptr nocapture noundef %dst, ptr nocapture noundef reado ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[DST_ADDR_130]], i64 1 ; CHECK-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i8, ptr [[SRC_ADDR_129]], i64 1 ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i64 [[DEC]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]] ; CHECK: while.end9: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll index 87d3900765490..4e267ae61aaa7 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll @@ -15,16 +15,19 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-LABEL: @vdiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body.preheader: +; CHECK-NEXT: br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] +; CHECK: iter.check: ; CHECK-NEXT: [[X4:%.*]] = ptrtoint ptr [[X:%.*]] to i64 ; CHECK-NEXT: [[Y5:%.*]] = ptrtoint ptr [[Y:%.*]] to i64 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[X4]], [[Y5]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128 ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[DIFF_CHECK]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_BODY_PREHEADER9:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK6:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK6]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483632 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0 @@ -41,13 +44,13 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 64 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 96 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP5]], align 8, !tbaa [[TBAA3:![0-9]+]] -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP6]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP7]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x double>, ptr [[TMP8]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP6]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x double>, ptr [[TMP7]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x double>, ptr [[TMP8]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7]], [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8]], [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8]], [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x double> [[WIDE_LOAD9]], [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i64 32 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i64 64 @@ -61,94 +64,119 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9]] -; CHECK: for.body.preheader9: -; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 12 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY_PREHEADER]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_VEC11:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483644 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT15]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX12]] +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x double>, ptr [[TMP19]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP20:%.*]] = fmul fast <4 x double> [[WIDE_LOAD13]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX12]] +; CHECK-NEXT: store <4 x double> [[TMP20]], ptr [[TMP21]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX12]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC11]] +; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[N_VEC11]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[CMP_N17]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP23:%.*]] = sub nsw i64 [[WIDE_TRIP_COUNT]], [[INDVARS_IV_PH]] +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP23]], 7 ; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 ; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL_PREHEADER:%.*]] ; CHECK: for.body.prol.preheader: -; CHECK-NEXT: [[TMP18:%.*]] = fdiv fast double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP24:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: br label [[FOR_BODY_PROL:%.*]] ; CHECK: for.body.prol: ; CHECK-NEXT: [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_BODY_PROL]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PROL_PREHEADER]] ] ; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_NEXT:%.*]], [[FOR_BODY_PROL]] ], [ 0, [[FOR_BODY_PROL_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_PROL]] ; CHECK-NEXT: [[T0_PROL:%.*]] = load double, ptr [[ARRAYIDX_PROL]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP19:%.*]] = fmul fast double [[T0_PROL]], [[TMP18]] +; CHECK-NEXT: [[TMP25:%.*]] = fmul fast double [[T0_PROL]], [[TMP24]] ; CHECK-NEXT: [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_PROL]] -; CHECK-NEXT: store double [[TMP19]], ptr [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP25]], ptr [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1 ; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1 ; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]] -; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: for.body.prol.loopexit: -; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER9]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ] -; CHECK-NEXT: [[TMP20:%.*]] = sub nsw i64 [[INDVARS_IV_PH]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: [[TMP21:%.*]] = icmp ugt i64 [[TMP20]], -8 -; CHECK-NEXT: br i1 [[TMP21]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9_NEW:%.*]] -; CHECK: for.body.preheader9.new: -; CHECK-NEXT: [[TMP22:%.*]] = fdiv fast double 1.000000e+00, [[A]] -; CHECK-NEXT: [[TMP23:%.*]] = fdiv fast double 1.000000e+00, [[A]] -; CHECK-NEXT: [[TMP24:%.*]] = fdiv fast double 1.000000e+00, [[A]] -; CHECK-NEXT: [[TMP25:%.*]] = fdiv fast double 1.000000e+00, [[A]] -; CHECK-NEXT: [[TMP26:%.*]] = fdiv fast double 1.000000e+00, [[A]] -; CHECK-NEXT: [[TMP27:%.*]] = fdiv fast double 1.000000e+00, [[A]] +; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ] +; CHECK-NEXT: [[TMP26:%.*]] = sub nsw i64 [[INDVARS_IV_PH]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ugt i64 [[TMP26]], -8 +; CHECK-NEXT: br i1 [[TMP27]], label [[FOR_END]], label [[FOR_BODY_PREHEADER_NEW:%.*]] +; CHECK: for.body.preheader.new: ; CHECK-NEXT: [[TMP28:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP29:%.*]] = fdiv fast double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP30:%.*]] = fdiv fast double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP31:%.*]] = fdiv fast double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP32:%.*]] = fdiv fast double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP33:%.*]] = fdiv fast double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP34:%.*]] = fdiv fast double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP35:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER9_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP30:%.*]] = fmul fast double [[T0]], [[TMP22]] +; CHECK-NEXT: [[TMP36:%.*]] = fmul fast double [[T0]], [[TMP28]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store double [[TMP30]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP36]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[T0_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP31:%.*]] = fmul fast double [[T0_1]], [[TMP23]] +; CHECK-NEXT: [[TMP37:%.*]] = fmul fast double [[T0_1]], [[TMP29]] ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: store double [[TMP31]], ptr [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP37]], ptr [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_1]] ; CHECK-NEXT: [[T0_2:%.*]] = load double, ptr [[ARRAYIDX_2]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP32:%.*]] = fmul fast double [[T0_2]], [[TMP24]] +; CHECK-NEXT: [[TMP38:%.*]] = fmul fast double [[T0_2]], [[TMP30]] ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_1]] -; CHECK-NEXT: store double [[TMP32]], ptr [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP38]], ptr [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_2]] ; CHECK-NEXT: [[T0_3:%.*]] = load double, ptr [[ARRAYIDX_3]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP33:%.*]] = fmul fast double [[T0_3]], [[TMP25]] +; CHECK-NEXT: [[TMP39:%.*]] = fmul fast double [[T0_3]], [[TMP31]] ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_2]] -; CHECK-NEXT: store double [[TMP33]], ptr [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP39]], ptr [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4 ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_3]] ; CHECK-NEXT: [[T0_4:%.*]] = load double, ptr [[ARRAYIDX_4]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP34:%.*]] = fmul fast double [[T0_4]], [[TMP26]] +; CHECK-NEXT: [[TMP40:%.*]] = fmul fast double [[T0_4]], [[TMP32]] ; CHECK-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_3]] -; CHECK-NEXT: store double [[TMP34]], ptr [[ARRAYIDX2_4]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP40]], ptr [[ARRAYIDX2_4]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5 ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_4]] ; CHECK-NEXT: [[T0_5:%.*]] = load double, ptr [[ARRAYIDX_5]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP35:%.*]] = fmul fast double [[T0_5]], [[TMP27]] +; CHECK-NEXT: [[TMP41:%.*]] = fmul fast double [[T0_5]], [[TMP33]] ; CHECK-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_4]] -; CHECK-NEXT: store double [[TMP35]], ptr [[ARRAYIDX2_5]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP41]], ptr [[ARRAYIDX2_5]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6 ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_5]] ; CHECK-NEXT: [[T0_6:%.*]] = load double, ptr [[ARRAYIDX_6]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP36:%.*]] = fmul fast double [[T0_6]], [[TMP28]] +; CHECK-NEXT: [[TMP42:%.*]] = fmul fast double [[T0_6]], [[TMP34]] ; CHECK-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_5]] -; CHECK-NEXT: store double [[TMP36]], ptr [[ARRAYIDX2_6]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP42]], ptr [[ARRAYIDX2_6]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_6]] ; CHECK-NEXT: [[T0_7:%.*]] = load double, ptr [[ARRAYIDX_7]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP37:%.*]] = fmul fast double [[T0_7]], [[TMP29]] +; CHECK-NEXT: [[TMP43:%.*]] = fmul fast double [[T0_7]], [[TMP35]] ; CHECK-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_6]] -; CHECK-NEXT: store double [[TMP37]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP43]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ;