diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index cefce93f9e25c..79c386384d5c4 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1244,6 +1244,8 @@ class TargetTransformInfo { /// and the number of execution units in the CPU. unsigned getMaxInterleaveFactor(ElementCount VF) const; + ElementCount getMaxPredicateLength(ElementCount VF) const; + /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2. static OperandValueInfo getOperandInfo(const Value *V); @@ -2002,6 +2004,9 @@ class TargetTransformInfo::Concept { virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0; virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0; + + virtual ElementCount getMaxPredicateLength(ElementCount VF) const = 0; + virtual InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, OperandValueInfo Opd1Info, OperandValueInfo Opd2Info, @@ -2627,6 +2632,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { unsigned getMaxInterleaveFactor(ElementCount VF) override { return Impl.getMaxInterleaveFactor(VF); } + + ElementCount getMaxPredicateLength(ElementCount VF) const override { + return Impl.getMaxPredicateLength(VF); + } + unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 9a57331d281db..44812222bb57e 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -537,6 +537,8 @@ class TargetTransformInfoImplBase { unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; } + ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; } + InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 2091432d4fe27..c2d2db250ef21 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -890,6 +890,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; } + ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; } + InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None}, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 82b6d7e7c4833..383bc2fcb2a99 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -821,6 +821,10 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const { return TTIImpl->getMaxInterleaveFactor(VF); } +ElementCount TargetTransformInfo::getMaxPredicateLength(ElementCount VF) const { + return TTIImpl->getMaxPredicateLength(VF); +} + TargetTransformInfo::OperandValueInfo TargetTransformInfo::getOperandInfo(const Value *V) { OperandValueKind OpInfo = OK_AnyValue; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index f49c73dc79519..ff5036538589c 100644 --- 
a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3383,6 +3383,15 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) { return ST->getMaxInterleaveFactor(); } +ElementCount AArch64TTIImpl::getMaxPredicateLength(ElementCount VF) const { + // Do not create masks bigger than `<vscale x 16 x i1>`. + unsigned N = ST->hasSVE() ? 16 : 0; + // Do not create masks that are more than twice the VF. + N = std::min(N, 2 * VF.getKnownMinValue()); + return VF.isScalable() ? ElementCount::getScalable(N) + : ElementCount::getFixed(N); +} + // For Falkor, we want to avoid having too many strided loads in a loop since // that can exhaust the HW prefetcher resources. We adjust the unroller // MaxCount preference below to attempt to ensure unrolling doesn't create too diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 2f44aaa3e26ab..6f2af4eaf40d6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -157,6 +157,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { unsigned getMaxInterleaveFactor(ElementCount VF); + + ElementCount getMaxPredicateLength(ElementCount VF) const; + bool prefersVectorizedAddressing() const; InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index c03c278fcebe7..f07bdea90c0f7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -202,6 +202,14 @@ class VPBuilder { VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL = {}, const Twine &Name = ""); + VPValue *createGetActiveLaneMask(VPValue *IV, VPValue *TC, DebugLoc DL, + const Twine &Name = "") { + auto *ALM = new VPActiveLaneMaskRecipe(IV, TC, DL, Name); + if (BB) + BB->insert(ALM, InsertPt); + return ALM; + } + //===--------------------------------------------------------------------===// // RAII helpers. //===--------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a9ee9f62197e1..cb8786fe32b44 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -588,6 +588,10 @@ class InnerLoopVectorizer { /// count of the original loop for both main loop and epilogue vectorization. void setTripCount(Value *TC) { TripCount = TC; } + ElementCount getMaxPredicateLength(ElementCount VF) const { + return TTI->getMaxPredicateLength(VF); + } + protected: friend class LoopVectorizationPlanner; @@ -7509,7 +7513,8 @@ LoopVectorizationPlanner::executePlan( LLVM_DEBUG(BestVPlan.dump()); // Perform the actual loop transformation. - VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, + VPTransformState State(BestVF, BestUF, TTI.getMaxPredicateLength(BestVF), LI, + DT, ILV.Builder, &ILV, &BestVPlan, OrigLoop->getHeader()->getContext()); // 0.
Generate SCEV-dependent code into the preheader, including TripCount, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f17be451e6846..dfe25f54130f2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -215,12 +215,13 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { return It; } -VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, +VPTransformState::VPTransformState(ElementCount VF, unsigned UF, + ElementCount MaxPred, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx) - : VF(VF), UF(UF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan), - LVer(nullptr), + : VF(VF), UF(UF), MaxPred(MaxPred), CFG(DT), LI(LI), Builder(Builder), + ILV(ILV), Plan(Plan), LVer(nullptr), TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {} Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index d6b63a5d43c46..06f77433af6b8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -235,13 +235,14 @@ struct VPIteration { /// VPTransformState holds information passed down when "executing" a VPlan, /// needed for generating the output IR. struct VPTransformState { - VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, - DominatorTree *DT, IRBuilderBase &Builder, + VPTransformState(ElementCount VF, unsigned UF, ElementCount MaxPred, + LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx); /// The chosen Vectorization and Unroll Factors of the loop being vectorized. ElementCount VF; unsigned UF; + ElementCount MaxPred; /// Hold the indices to generate specific scalar instructions. Null indicates /// that all instances are to be generated, using either scalar or vector @@ -1174,7 +1175,6 @@ class VPInstruction : public VPRecipeWithIRFlags { Not, SLPLoad, SLPStore, - ActiveLaneMask, ExplicitVectorLength, CalculateTripCountMinusVF, // Increment the canonical IV separately for each unrolled part. @@ -1329,6 +1329,50 @@ class VPInstruction : public VPRecipeWithIRFlags { } }; +class VPActiveLaneMaskRecipe : public VPRecipeWithIRFlags { + const std::string Name; + +public: + VPActiveLaneMaskRecipe(VPValue *IV, VPValue *TC, DebugLoc DL = {}, + const Twine &Name = "") + : VPRecipeWithIRFlags(VPDef::VPActiveLaneMaskSC, + std::initializer_list<VPValue *>{IV, TC}, DL), + Name(Name.str()) {} + + VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskSC) + + VPActiveLaneMaskRecipe *clone() override { + SmallVector<VPValue *, 2> Operands(operands()); + assert(Operands.size() == 2 && "by construction"); + auto *New = new VPActiveLaneMaskRecipe(Operands[0], Operands[1], + getDebugLoc(), Name); + New->transferFlags(*this); + return New; + } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + + return true; + } + + bool onlyFirstPartUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + + return false; + } +}; + /// VPWidenRecipe is a recipe for producing a copy of vector type its /// ingredient. This recipe covers most of the traditional vectorization cases /// where each ingredient transforms into a vectorized version of itself. diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 0587468807435..5cd18ea7ba2c7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -221,11 +221,6 @@ m_BranchOnCond(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::BranchOnCond>(Op0); } -template <typename Op0_t, typename Op1_t> -inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask> -m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) { - return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1); -} template inline BinaryVPInstruction_match @@ -296,6 +291,35 @@ inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::LogicalAnd> m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) { return m_VPInstruction<VPInstruction::LogicalAnd>(Op0, Op1); } + +template <typename Op0_t, typename Op1_t> +struct VPActiveLaneMask_match { + Op0_t Op0; + Op1_t Op1; + + VPActiveLaneMask_match(Op0_t Op0, Op1_t Op1) : Op0(Op0), Op1(Op1) {} + + bool match(const VPValue *V) { + auto *DefR = V->getDefiningRecipe(); + return DefR && match(DefR); + } + + bool match(const VPRecipeBase *R) { + auto *DefR = dyn_cast<VPActiveLaneMaskRecipe>(R); + if (!DefR) + return false; + assert(DefR->getNumOperands() == 2 && + "recipe with matched opcode does not have 2 operands"); + return Op0.match(DefR->getOperand(0)) && Op1.match(DefR->getOperand(1)); + } +}; + +template <typename Op0_t, typename Op1_t> +inline VPActiveLaneMask_match<Op0_t, Op1_t> +m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) { + return {Op0, Op1}; +} + } // namespace VPlanPatternMatch } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 5eb99ffd1e10e..a648fb99dd2d1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -351,24 +351,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { Value *Op2 = State.get(getOperand(2), Part); return Builder.CreateSelect(Cond, Op1, Op2, Name); } - case VPInstruction::ActiveLaneMask: { - // Get first lane of vector induction variable. - Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); - // Get the original loop tripcount. - Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0)); - // If this part of the active lane mask is scalar, generate the CMP directly - // to avoid unnecessary extracts. - if (State.VF.isScalar()) - return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, - Name); - - auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = VectorType::get(Int1Ty, State.VF); - return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, - {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, Name); - } case VPInstruction::FirstOrderRecurrenceSplice: { // Generate code to combine the previous and current values in vector v3. // @@ -636,7 +619,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::PtrAdd: // TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this); - case VPInstruction::ActiveLaneMask: case VPInstruction::ExplicitVectorLength: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: @@ -671,9 +653,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::SLPStore: O << "combined store"; break; - case VPInstruction::ActiveLaneMask: - O << "active lane mask"; - break; case VPInstruction::ExplicitVectorLength: O << "EXPLICIT-VECTOR-LENGTH"; break; @@ -713,8 +692,94 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, DL.print(O); } } + +void VPActiveLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + + printAsOperand(O, SlotTracker); + O << " = active lane mask"; + printFlags(O); + printOperands(O, SlotTracker); + + if (auto DL = getDebugLoc()) { + O << ", !dbg "; + DL.print(O); + } +} + #endif +void VPActiveLaneMaskRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "VPInstruction executing an Instance"); + + IRBuilderBase &Builder = State.Builder; + Builder.SetCurrentDebugLocation(getDebugLoc()); + + // If the active lane mask is scalar, generate the CMP directly + // to avoid unnecessary extracts. + if (State.VF.isScalar()) { + for (int Part = State.UF - 1; Part >= 0; --Part) { + // Get first lane of vector induction variable. + Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); + // Get the original loop tripcount. + Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0)); + + Value *V = Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, + ScalarTC, Name); + State.set(this, V, Part); + } + return; + } + + auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); + auto *PredTy = VectorType::get(Int1Ty, State.VF); + + unsigned MaxPred = std::min(State.MaxPred.getKnownMinValue(), + State.UF * State.VF.getKnownMinValue()); + if (State.UF <= 1 || MaxPred <= State.VF.getKnownMinValue() || + MaxPred % State.VF.getKnownMinValue() != 0) { + for (int Part = State.UF - 1; Part >= 0; --Part) { + // Get first lane of vector induction variable. + Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); + // Get the original loop tripcount. + Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0)); + Value *V = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, + {PredTy, ScalarTC->getType()}, + {VIVElem0, ScalarTC}, nullptr, Name); + State.set(this, V, Part); + } + return; + } + + // Generate long active lane masks covering all the unrolled iterations. + unsigned PartsPerMask = MaxPred / State.VF.getKnownMinValue(); + auto *LongPredTy = VectorType::get(Int1Ty, MaxPred, State.VF.isScalable()); + SmallVector<Value *> LongMask(State.UF / PartsPerMask, nullptr); + for (int Part = State.UF - PartsPerMask; Part >= 0; Part -= PartsPerMask) { + // Get first lane of vector induction variable. + Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); + // Get the original loop tripcount.
+ Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0)); + Value *V = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, + {LongPredTy, ScalarTC->getType()}, + {VIVElem0, ScalarTC}, nullptr, Name); + LongMask[Part / PartsPerMask] = V; + } + + for (int Part = State.UF - 1; Part >= 0; --Part) { + Value *ALM = LongMask[Part / PartsPerMask]; + const unsigned I = Part % PartsPerMask; + Value *V = Builder.CreateIntrinsic( + Intrinsic::vector_extract, {PredTy, ALM->getType()}, + {ALM, ConstantInt::get(Type::getInt64Ty(Builder.getContext()), + I * State.VF.getKnownMinValue())}, + nullptr, Name); + + State.set(this, V, Part); + } +} + void VPWidenCallRecipe::execute(VPTransformState &State) { assert(State.VF.isVector() && "not widening"); Function *CalledScalarFn = getCalledScalarFunction(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 422579ea8b84f..061956f0e30ec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1186,9 +1186,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "index.part.next"); // Create the active lane mask instruction in the VPlan preheader. - auto *EntryALM = - Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC}, - DL, "active.lane.mask.entry"); + auto *EntryALM = Builder.createGetActiveLaneMask(EntryIncrement, TC, DL, + "active.lane.mask.entry"); // Now create the ActiveLaneMaskPhi recipe in the main loop using the // preheader ActiveLaneMask instruction. @@ -1202,9 +1201,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( auto *InLoopIncrement = Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, {IncrementValue}, {false, false}, DL); - auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, - {InLoopIncrement, TripCount}, DL, - "active.lane.mask.next"); + auto *ALM = Builder.createGetActiveLaneMask(InLoopIncrement, TripCount, DL, + "active.lane.mask.next"); LaneMaskPhi->addOperand(ALM); // Replace the original terminator with BranchOnCond. We have to invert the @@ -1278,15 +1276,15 @@ void VPlanTransforms::addActiveLaneMask( "Must have widened canonical IV when tail folding!"); auto *WideCanonicalIV = cast(*FoundWidenCanonicalIVUser); - VPSingleDefRecipe *LaneMask; + VPValue *LaneMask; if (UseActiveLaneMaskForControlFlow) { LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( Plan, DataAndControlFlowWithoutRuntimeCheck); } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); - LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, - {WideCanonicalIV, Plan.getTripCount()}, nullptr, - "active.lane.mask"); + LaneMask = B.createGetActiveLaneMask(WideCanonicalIV, Plan.getTripCount(), + WideCanonicalIV->getDebugLoc(), + "active.lane.mask"); } // Walk users of WideCanonicalIV and replace all compares of the form diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 8d945f6f2b8ea..b90070579e5e8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -358,6 +358,7 @@ class VPDef { VPWidenSC, VPWidenSelectSC, VPBlendSC, + VPActiveLaneMaskSC, // START: Phi-like recipes. Need to be kept together. 
VPWidenPHISC, VPPredInstPHISC, diff --git a/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll b/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll new file mode 100644 index 0000000000000..866185cac6a0a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll @@ -0,0 +1,1069 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefix CHECK-SVE +; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1 + +target triple = "aarch64-unknown-linux" + +; Byte elements, UF=0 +define void @f_b_0(ptr noalias %dst, ptr readonly %src, i32 %n) #0 { +; CHECK-SVE-LABEL: f_b_0: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: cmp w2, #1 +; CHECK-SVE-NEXT: b.lt .LBB0_3 +; CHECK-SVE-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE-NEXT: mov w9, w2 +; CHECK-SVE-NEXT: rdvl x10, #1 +; CHECK-SVE-NEXT: mov x8, xzr +; CHECK-SVE-NEXT: whilelo p0.b, xzr, x9 +; CHECK-SVE-NEXT: .p2align 5, , 16 +; CHECK-SVE-NEXT: .LBB0_2: // %vector.body +; CHECK-SVE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE-NEXT: ld1b { z0.b }, p0/z, [x1, x8] +; CHECK-SVE-NEXT: mul z0.b, z0.b, #3 +; CHECK-SVE-NEXT: st1b { z0.b }, p0, [x0, x8] +; CHECK-SVE-NEXT: add x8, x8, x10 +; CHECK-SVE-NEXT: whilelo p0.b, x8, x9 +; CHECK-SVE-NEXT: b.mi .LBB0_2 +; CHECK-SVE-NEXT: .LBB0_3: // %for.cond.cleanup +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-LABEL: f_b_0: +; CHECK-SVE2p1: // %bb.0: // %entry +; CHECK-SVE2p1-NEXT: cmp w2, #1 +; CHECK-SVE2p1-NEXT: b.lt .LBB0_3 +; CHECK-SVE2p1-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2p1-NEXT: mov w9, w2 +; CHECK-SVE2p1-NEXT: mov x8, xzr +; CHECK-SVE2p1-NEXT: whilelo p0.b, xzr, x9 +; CHECK-SVE2p1-NEXT: .p2align 5, , 16 +; CHECK-SVE2p1-NEXT: .LBB0_2: // %vector.body +; CHECK-SVE2p1-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2p1-NEXT: ld1b { z0.b }, p0/z, [x1, x8] +; CHECK-SVE2p1-NEXT: mul z0.b, z0.b, #3 +; CHECK-SVE2p1-NEXT: st1b { z0.b }, p0, [x0, x8] +; CHECK-SVE2p1-NEXT: addvl x8, x8, #1 +; CHECK-SVE2p1-NEXT: whilelo p0.b, x8, x9 +; CHECK-SVE2p1-NEXT: b.mi .LBB0_2 +; CHECK-SVE2p1-NEXT: .LBB0_3: // %for.cond.cleanup +; CHECK-SVE2p1-NEXT: ret +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 4 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count) + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %2 = getelementptr inbounds i8, ptr %src, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv16i8.p0(ptr %2, i32 1, %active.lane.mask, poison) + %3 = mul %wide.masked.load, shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) + %4 = getelementptr inbounds i8, ptr %dst, i64 %index + tail call void @llvm.masked.store.nxv16i8.p0( %3, ptr %4, i32 1, %active.lane.mask) + %index.next = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count) + %5 = extractelement %active.lane.mask.next, i64 0 + br i1 %5, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; Halfword elements, UF=0 +define void @f_h_0(ptr noalias 
%dst, ptr %src, i32 %n) #0 { +; CHECK-SVE-LABEL: f_h_0: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: cmp w2, #1 +; CHECK-SVE-NEXT: b.lt .LBB1_3 +; CHECK-SVE-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE-NEXT: mov w9, w2 +; CHECK-SVE-NEXT: cnth x10 +; CHECK-SVE-NEXT: mov x8, xzr +; CHECK-SVE-NEXT: whilelo p0.h, xzr, x9 +; CHECK-SVE-NEXT: .p2align 5, , 16 +; CHECK-SVE-NEXT: .LBB1_2: // %vector.body +; CHECK-SVE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] +; CHECK-SVE-NEXT: mul z0.h, z0.h, #3 +; CHECK-SVE-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; CHECK-SVE-NEXT: add x8, x8, x10 +; CHECK-SVE-NEXT: whilelo p0.h, x8, x9 +; CHECK-SVE-NEXT: b.mi .LBB1_2 +; CHECK-SVE-NEXT: .LBB1_3: // %for.cond.cleanup +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-LABEL: f_h_0: +; CHECK-SVE2p1: // %bb.0: // %entry +; CHECK-SVE2p1-NEXT: cmp w2, #1 +; CHECK-SVE2p1-NEXT: b.lt .LBB1_3 +; CHECK-SVE2p1-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2p1-NEXT: mov w9, w2 +; CHECK-SVE2p1-NEXT: mov x8, xzr +; CHECK-SVE2p1-NEXT: whilelo p0.h, xzr, x9 +; CHECK-SVE2p1-NEXT: .p2align 5, , 16 +; CHECK-SVE2p1-NEXT: .LBB1_2: // %vector.body +; CHECK-SVE2p1-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2p1-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] +; CHECK-SVE2p1-NEXT: mul z0.h, z0.h, #3 +; CHECK-SVE2p1-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; CHECK-SVE2p1-NEXT: inch x8 +; CHECK-SVE2p1-NEXT: whilelo p0.h, x8, x9 +; CHECK-SVE2p1-NEXT: b.mi .LBB1_2 +; CHECK-SVE2p1-NEXT: .LBB1_3: // %for.cond.cleanup +; CHECK-SVE2p1-NEXT: ret +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 3 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count) + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %2 = getelementptr inbounds i16, ptr %src, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv8i16.p0(ptr %2, i32 2, %active.lane.mask, poison) + %3 = mul %wide.masked.load, shufflevector ( insertelement ( poison, i16 3, i64 0), poison, zeroinitializer) + %4 = getelementptr inbounds i16, ptr %dst, i64 %index + tail call void @llvm.masked.store.nxv8i16.p0( %3, ptr %4, i32 2, %active.lane.mask) + %index.next = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count) + %5 = extractelement %active.lane.mask.next, i64 0 + br i1 %5, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; Double-precision float elements, UF=0 +define void @f_d_0(ptr noalias %dst, ptr readonly %src, i32 %n) #0 { +; CHECK-SVE-LABEL: f_d_0: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: cmp w2, #1 +; CHECK-SVE-NEXT: b.lt .LBB2_3 +; CHECK-SVE-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE-NEXT: mov w9, w2 +; CHECK-SVE-NEXT: fmov z0.d, #3.00000000 +; CHECK-SVE-NEXT: cntd x10 +; CHECK-SVE-NEXT: mov x8, xzr +; CHECK-SVE-NEXT: whilelo p0.d, xzr, x9 +; CHECK-SVE-NEXT: .p2align 5, , 16 +; CHECK-SVE-NEXT: .LBB2_2: // %vector.body +; CHECK-SVE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] +; CHECK-SVE-NEXT: fmul 
z1.d, z1.d, z0.d +; CHECK-SVE-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] +; CHECK-SVE-NEXT: add x8, x8, x10 +; CHECK-SVE-NEXT: whilelo p0.d, x8, x9 +; CHECK-SVE-NEXT: b.mi .LBB2_2 +; CHECK-SVE-NEXT: .LBB2_3: // %for.cond.cleanup +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-LABEL: f_d_0: +; CHECK-SVE2p1: // %bb.0: // %entry +; CHECK-SVE2p1-NEXT: cmp w2, #1 +; CHECK-SVE2p1-NEXT: b.lt .LBB2_3 +; CHECK-SVE2p1-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2p1-NEXT: mov w9, w2 +; CHECK-SVE2p1-NEXT: fmov z0.d, #3.00000000 +; CHECK-SVE2p1-NEXT: mov x8, xzr +; CHECK-SVE2p1-NEXT: whilelo p0.d, xzr, x9 +; CHECK-SVE2p1-NEXT: .p2align 5, , 16 +; CHECK-SVE2p1-NEXT: .LBB2_2: // %vector.body +; CHECK-SVE2p1-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2p1-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] +; CHECK-SVE2p1-NEXT: fmul z1.d, z1.d, z0.d +; CHECK-SVE2p1-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] +; CHECK-SVE2p1-NEXT: incd x8 +; CHECK-SVE2p1-NEXT: whilelo p0.d, x8, x9 +; CHECK-SVE2p1-NEXT: b.mi .LBB2_2 +; CHECK-SVE2p1-NEXT: .LBB2_3: // %for.cond.cleanup +; CHECK-SVE2p1-NEXT: ret +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 1 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count) + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %2 = getelementptr inbounds double, ptr %src, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv2f64.p0(ptr %2, i32 8, %active.lane.mask, poison) + %3 = fmul %wide.masked.load, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) + %4 = getelementptr inbounds double, ptr %dst, i64 %index + tail call void @llvm.masked.store.nxv2f64.p0( %3, ptr %4, i32 8, %active.lane.mask) + %index.next = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count) + %5 = extractelement %active.lane.mask.next, i64 0 + br i1 %5, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; Byte elements, UF=2 +define void @f_b_2(ptr noalias %dst, ptr readonly %src, i32 %n) #0 { +; CHECK-SVE-LABEL: f_b_2: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: cmp w2, #1 +; CHECK-SVE-NEXT: b.lt .LBB3_3 +; CHECK-SVE-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE-NEXT: rdvl x11, #1 +; CHECK-SVE-NEXT: mov w8, w2 +; CHECK-SVE-NEXT: mov x9, xzr +; CHECK-SVE-NEXT: add x10, x0, x11 +; CHECK-SVE-NEXT: whilelo p0.b, x11, x8 +; CHECK-SVE-NEXT: add x11, x1, x11 +; CHECK-SVE-NEXT: rdvl x12, #2 +; CHECK-SVE-NEXT: rdvl x13, #3 +; CHECK-SVE-NEXT: whilelo p1.b, xzr, x8 +; CHECK-SVE-NEXT: .p2align 5, , 16 +; CHECK-SVE-NEXT: .LBB3_2: // %vector.body +; CHECK-SVE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE-NEXT: ld1b { z0.b }, p1/z, [x1, x9] +; CHECK-SVE-NEXT: ld1b { z1.b }, p0/z, [x11, x9] +; CHECK-SVE-NEXT: add x14, x13, x9 +; CHECK-SVE-NEXT: mul z0.b, z0.b, #3 +; CHECK-SVE-NEXT: mul z1.b, z1.b, #3 +; CHECK-SVE-NEXT: st1b { z0.b }, p1, [x0, x9] +; CHECK-SVE-NEXT: st1b { z1.b }, p0, [x10, x9] +; CHECK-SVE-NEXT: add x9, x12, x9 +; CHECK-SVE-NEXT: whilelo p0.b, x14, x8 +; CHECK-SVE-NEXT: whilelo p1.b, x9, x8 +; CHECK-SVE-NEXT: 
b.mi .LBB3_2 +; CHECK-SVE-NEXT: .LBB3_3: // %for.cond.cleanup +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-LABEL: f_b_2: +; CHECK-SVE2p1: // %bb.0: // %entry +; CHECK-SVE2p1-NEXT: cmp w2, #1 +; CHECK-SVE2p1-NEXT: b.lt .LBB3_3 +; CHECK-SVE2p1-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2p1-NEXT: rdvl x10, #1 +; CHECK-SVE2p1-NEXT: mov w8, w2 +; CHECK-SVE2p1-NEXT: mov x9, xzr +; CHECK-SVE2p1-NEXT: addvl x11, x1, #1 +; CHECK-SVE2p1-NEXT: whilelo p1.b, xzr, x8 +; CHECK-SVE2p1-NEXT: whilelo p0.b, x10, x8 +; CHECK-SVE2p1-NEXT: addvl x10, x0, #1 +; CHECK-SVE2p1-NEXT: .p2align 5, , 16 +; CHECK-SVE2p1-NEXT: .LBB3_2: // %vector.body +; CHECK-SVE2p1-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2p1-NEXT: ld1b { z0.b }, p1/z, [x1, x9] +; CHECK-SVE2p1-NEXT: ld1b { z1.b }, p0/z, [x11, x9] +; CHECK-SVE2p1-NEXT: addvl x12, x9, #3 +; CHECK-SVE2p1-NEXT: mul z0.b, z0.b, #3 +; CHECK-SVE2p1-NEXT: mul z1.b, z1.b, #3 +; CHECK-SVE2p1-NEXT: st1b { z0.b }, p1, [x0, x9] +; CHECK-SVE2p1-NEXT: st1b { z1.b }, p0, [x10, x9] +; CHECK-SVE2p1-NEXT: addvl x9, x9, #2 +; CHECK-SVE2p1-NEXT: whilelo p0.b, x12, x8 +; CHECK-SVE2p1-NEXT: whilelo p1.b, x9, x8 +; CHECK-SVE2p1-NEXT: b.mi .LBB3_2 +; CHECK-SVE2p1-NEXT: .LBB3_3: // %for.cond.cleanup +; CHECK-SVE2p1-NEXT: ret +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 5 + %2 = tail call i64 @llvm.vscale.i64() + %3 = shl nuw nsw i64 %2, 4 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %3, i64 %wide.trip.count) + %active.lane.mask.entry10 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count) + %4 = tail call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 4 + %6 = tail call i64 @llvm.vscale.i64() + %7 = shl nuw nsw i64 %6, 4 + %8 = tail call i64 @llvm.vscale.i64() + %9 = shl nuw nsw i64 %8, 4 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next13, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry10, %for.body.preheader ], [ %active.lane.mask.next14, %vector.body ] + %active.lane.mask11 = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %10 = getelementptr inbounds i8, ptr %src, i64 %index + %11 = getelementptr inbounds i8, ptr %10, i64 %5 + %wide.masked.load = tail call @llvm.masked.load.nxv16i8.p0(ptr %10, i32 1, %active.lane.mask, poison) + %wide.masked.load12 = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull %11, i32 1, %active.lane.mask11, poison) + %12 = mul %wide.masked.load, shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) + %13 = mul %wide.masked.load12, shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) + %14 = getelementptr inbounds i8, ptr %dst, i64 %index + %15 = getelementptr inbounds i8, ptr %14, i64 %7 + tail call void @llvm.masked.store.nxv16i8.p0( %12, ptr %14, i32 1, %active.lane.mask) + tail call void @llvm.masked.store.nxv16i8.p0( %13, ptr %15, i32 1, %active.lane.mask11) + %index.next = add i64 %index, %1 + %index.next13 = add i64 %index, %1 + %16 = add i64 %index.next, %9 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %16, i64 %wide.trip.count) + %active.lane.mask.next14 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count) + %17 = extractelement 
%active.lane.mask.next14, i64 0 + br i1 %17, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; Halfword elements, UF=2 +define void @f_h_2(ptr noalias %dst, ptr readonly %src, i32 %n) #0 { +; CHECK-SVE-LABEL: f_h_2: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: cmp w2, #1 +; CHECK-SVE-NEXT: b.lt .LBB4_3 +; CHECK-SVE-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE-NEXT: mov w9, w2 +; CHECK-SVE-NEXT: rdvl x10, #1 +; CHECK-SVE-NEXT: mov x8, xzr +; CHECK-SVE-NEXT: add x11, x0, x10 +; CHECK-SVE-NEXT: whilelo p1.b, xzr, x9 +; CHECK-SVE-NEXT: add x12, x1, x10 +; CHECK-SVE-NEXT: punpkhi p0.h, p1.b +; CHECK-SVE-NEXT: punpklo p1.h, p1.b +; CHECK-SVE-NEXT: .p2align 5, , 16 +; CHECK-SVE-NEXT: .LBB4_2: // %vector.body +; CHECK-SVE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE-NEXT: ld1h { z0.h }, p1/z, [x1, x8, lsl #1] +; CHECK-SVE-NEXT: ld1h { z1.h }, p0/z, [x12, x8, lsl #1] +; CHECK-SVE-NEXT: mul z0.h, z0.h, #3 +; CHECK-SVE-NEXT: mul z1.h, z1.h, #3 +; CHECK-SVE-NEXT: st1h { z0.h }, p1, [x0, x8, lsl #1] +; CHECK-SVE-NEXT: st1h { z1.h }, p0, [x11, x8, lsl #1] +; CHECK-SVE-NEXT: add x8, x10, x8 +; CHECK-SVE-NEXT: whilelo p1.b, x8, x9 +; CHECK-SVE-NEXT: punpkhi p0.h, p1.b +; CHECK-SVE-NEXT: punpklo p1.h, p1.b +; CHECK-SVE-NEXT: mov z0.h, p1/z, #1 // =0x1 +; CHECK-SVE-NEXT: fmov w13, s0 +; CHECK-SVE-NEXT: tbnz w13, #0, .LBB4_2 +; CHECK-SVE-NEXT: .LBB4_3: // %for.cond.cleanup +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-LABEL: f_h_2: +; CHECK-SVE2p1: // %bb.0: // %entry +; CHECK-SVE2p1-NEXT: cmp w2, #1 +; CHECK-SVE2p1-NEXT: b.lt .LBB4_3 +; CHECK-SVE2p1-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2p1-NEXT: mov w9, w2 +; CHECK-SVE2p1-NEXT: addvl x10, x0, #1 +; CHECK-SVE2p1-NEXT: mov x8, xzr +; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, xzr, x9 +; CHECK-SVE2p1-NEXT: addvl x11, x1, #1 +; CHECK-SVE2p1-NEXT: .p2align 5, , 16 +; CHECK-SVE2p1-NEXT: .LBB4_2: // %vector.body +; CHECK-SVE2p1-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2p1-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] +; CHECK-SVE2p1-NEXT: ld1h { z1.h }, p1/z, [x11, x8, lsl #1] +; CHECK-SVE2p1-NEXT: mul z0.h, z0.h, #3 +; CHECK-SVE2p1-NEXT: mul z1.h, z1.h, #3 +; CHECK-SVE2p1-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; CHECK-SVE2p1-NEXT: st1h { z1.h }, p1, [x10, x8, lsl #1] +; CHECK-SVE2p1-NEXT: addvl x8, x8, #1 +; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x8, x9 +; CHECK-SVE2p1-NEXT: mov z0.h, p0/z, #1 // =0x1 +; CHECK-SVE2p1-NEXT: fmov w12, s0 +; CHECK-SVE2p1-NEXT: tbnz w12, #0, .LBB4_2 +; CHECK-SVE2p1-NEXT: .LBB4_3: // %for.cond.cleanup +; CHECK-SVE2p1-NEXT: ret +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 4 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count) + %active.lane.mask.entry10 = tail call @llvm.vector.extract.nxv8i1.nxv16i1( %active.lane.mask.entry, i64 8) + %active.lane.mask.entry11 = tail call @llvm.vector.extract.nxv8i1.nxv16i1( %active.lane.mask.entry, i64 0) + %2 = tail call i64 @llvm.vscale.i64() + %3 = shl nuw nsw i64 %2, 3 + %4 = tail call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next14, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry11, %for.body.preheader ], [ %active.lane.mask.next16, 
%vector.body ] + %active.lane.mask12 = phi [ %active.lane.mask.entry10, %for.body.preheader ], [ %active.lane.mask.next15, %vector.body ] + %6 = getelementptr inbounds i16, ptr %src, i64 %index + %7 = getelementptr inbounds i16, ptr %6, i64 %3 + %wide.masked.load = tail call @llvm.masked.load.nxv8i16.p0(ptr %6, i32 2, %active.lane.mask, poison) + %wide.masked.load13 = tail call @llvm.masked.load.nxv8i16.p0(ptr nonnull %7, i32 2, %active.lane.mask12, poison) + %8 = mul %wide.masked.load, shufflevector ( insertelement ( poison, i16 3, i64 0), poison, zeroinitializer) + %9 = mul %wide.masked.load13, shufflevector ( insertelement ( poison, i16 3, i64 0), poison, zeroinitializer) + %10 = getelementptr inbounds i16, ptr %dst, i64 %index + %11 = getelementptr inbounds i16, ptr %10, i64 %5 + tail call void @llvm.masked.store.nxv8i16.p0( %8, ptr %10, i32 2, %active.lane.mask) + tail call void @llvm.masked.store.nxv8i16.p0( %9, ptr %11, i32 2, %active.lane.mask12) + %index.next = add i64 %index, %1 + %index.next14 = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count) + %active.lane.mask.next15 = tail call @llvm.vector.extract.nxv8i1.nxv16i1( %active.lane.mask.next, i64 8) + %active.lane.mask.next16 = tail call @llvm.vector.extract.nxv8i1.nxv16i1( %active.lane.mask.next, i64 0) + %12 = extractelement %active.lane.mask.next16, i64 0 + br i1 %12, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; Double-precision float elements, UF=2 +define void @f3(ptr noalias %dst, ptr readonly %src, i32 %n) #0 { +; CHECK-SVE-LABEL: f3: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: cmp w2, #1 +; CHECK-SVE-NEXT: b.lt .LBB5_3 +; CHECK-SVE-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE-NEXT: mov w9, w2 +; CHECK-SVE-NEXT: rdvl x11, #1 +; CHECK-SVE-NEXT: fmov z0.d, #3.00000000 +; CHECK-SVE-NEXT: mov x8, xzr +; CHECK-SVE-NEXT: add x10, x0, x11 +; CHECK-SVE-NEXT: whilelo p1.s, xzr, x9 +; CHECK-SVE-NEXT: add x11, x1, x11 +; CHECK-SVE-NEXT: cntw x12 +; CHECK-SVE-NEXT: punpkhi p0.h, p1.b +; CHECK-SVE-NEXT: punpklo p1.h, p1.b +; CHECK-SVE-NEXT: .p2align 5, , 16 +; CHECK-SVE-NEXT: .LBB5_2: // %vector.body +; CHECK-SVE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; CHECK-SVE-NEXT: ld1d { z2.d }, p0/z, [x11, x8, lsl #3] +; CHECK-SVE-NEXT: fmul z1.d, z1.d, z0.d +; CHECK-SVE-NEXT: fmul z2.d, z2.d, z0.d +; CHECK-SVE-NEXT: st1d { z1.d }, p1, [x0, x8, lsl #3] +; CHECK-SVE-NEXT: st1d { z2.d }, p0, [x10, x8, lsl #3] +; CHECK-SVE-NEXT: add x8, x12, x8 +; CHECK-SVE-NEXT: whilelo p1.s, x8, x9 +; CHECK-SVE-NEXT: punpkhi p0.h, p1.b +; CHECK-SVE-NEXT: punpklo p1.h, p1.b +; CHECK-SVE-NEXT: mov z1.d, p1/z, #1 // =0x1 +; CHECK-SVE-NEXT: fmov x13, d1 +; CHECK-SVE-NEXT: tbnz w13, #0, .LBB5_2 +; CHECK-SVE-NEXT: .LBB5_3: // %for.cond.cleanup +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-LABEL: f3: +; CHECK-SVE2p1: // %bb.0: // %entry +; CHECK-SVE2p1-NEXT: cmp w2, #1 +; CHECK-SVE2p1-NEXT: b.lt .LBB5_3 +; CHECK-SVE2p1-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2p1-NEXT: mov w9, w2 +; CHECK-SVE2p1-NEXT: addvl x10, x0, #1 +; CHECK-SVE2p1-NEXT: fmov z0.d, #3.00000000 +; CHECK-SVE2p1-NEXT: mov x8, xzr +; CHECK-SVE2p1-NEXT: whilelo { p0.d, p1.d }, xzr, x9 +; CHECK-SVE2p1-NEXT: addvl x11, x1, #1 +; CHECK-SVE2p1-NEXT: .p2align 5, , 16 +; CHECK-SVE2p1-NEXT: .LBB5_2: // %vector.body +; CHECK-SVE2p1-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2p1-NEXT: ld1d { z1.d }, 
p0/z, [x1, x8, lsl #3] +; CHECK-SVE2p1-NEXT: ld1d { z2.d }, p1/z, [x11, x8, lsl #3] +; CHECK-SVE2p1-NEXT: fmul z1.d, z1.d, z0.d +; CHECK-SVE2p1-NEXT: fmul z2.d, z2.d, z0.d +; CHECK-SVE2p1-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] +; CHECK-SVE2p1-NEXT: st1d { z2.d }, p1, [x10, x8, lsl #3] +; CHECK-SVE2p1-NEXT: incw x8 +; CHECK-SVE2p1-NEXT: whilelo { p0.d, p1.d }, x8, x9 +; CHECK-SVE2p1-NEXT: mov z1.d, p0/z, #1 // =0x1 +; CHECK-SVE2p1-NEXT: fmov x12, d1 +; CHECK-SVE2p1-NEXT: tbnz w12, #0, .LBB5_2 +; CHECK-SVE2p1-NEXT: .LBB5_3: // %for.cond.cleanup +; CHECK-SVE2p1-NEXT: ret +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 2 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count) + %active.lane.mask.entry1 = tail call @llvm.vector.extract.nxv2i1.nxv4i1( %active.lane.mask.entry, i64 2) + %active.lane.mask.entry2 = tail call @llvm.vector.extract.nxv2i1.nxv4i1( %active.lane.mask.entry, i64 0) + %2 = tail call i64 @llvm.vscale.i64() + %3 = shl nuw nsw i64 %2, 1 + %4 = tail call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 1 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next5, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry2, %for.body.preheader ], [ %active.lane.mask.next7, %vector.body ] + %active.lane.mask3 = phi [ %active.lane.mask.entry1, %for.body.preheader ], [ %active.lane.mask.next6, %vector.body ] + %6 = getelementptr inbounds double, ptr %src, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv2f64.p0(ptr %6, i32 8, %active.lane.mask, poison) + %7 = getelementptr inbounds double, ptr %6, i64 %3 + %wide.masked.load4 = tail call @llvm.masked.load.nxv2f64.p0(ptr nonnull %7, i32 8, %active.lane.mask3, poison) + %8 = fmul %wide.masked.load, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) + %9 = fmul %wide.masked.load4, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) + %10 = getelementptr inbounds double, ptr %dst, i64 %index + tail call void @llvm.masked.store.nxv2f64.p0( %8, ptr %10, i32 8, %active.lane.mask) + %11 = getelementptr inbounds double, ptr %10, i64 %5 + tail call void @llvm.masked.store.nxv2f64.p0( %9, ptr %11, i32 8, %active.lane.mask3) + %index.next = add i64 %index, %1 + %index.next5 = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count) + %active.lane.mask.next6 = tail call @llvm.vector.extract.nxv2i1.nxv4i1( %active.lane.mask.next, i64 2) + %active.lane.mask.next7 = tail call @llvm.vector.extract.nxv2i1.nxv4i1( %active.lane.mask.next, i64 0) + %12 = extractelement %active.lane.mask.next7, i64 0 + br i1 %12, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; Byte elements, UF=4 +define void @f_b_4(ptr noalias %dst, ptr %src, i32 %n) #0 { +; CHECK-SVE-LABEL: f_b_4: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: cmp w2, #1 +; CHECK-SVE-NEXT: b.lt .LBB6_3 +; CHECK-SVE-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE-NEXT: rdvl x14, #2 +; CHECK-SVE-NEXT: mov w8, w2 +; CHECK-SVE-NEXT: mov x9, xzr +; CHECK-SVE-NEXT: add x11, x0, x14 +; CHECK-SVE-NEXT: rdvl x13, #3 +; CHECK-SVE-NEXT: rdvl x15, #1 +; CHECK-SVE-NEXT: add x10, x0, x13 +; CHECK-SVE-NEXT: add 
x12, x0, x15 +; CHECK-SVE-NEXT: rdvl x16, #4 +; CHECK-SVE-NEXT: rdvl x17, #5 +; CHECK-SVE-NEXT: rdvl x18, #6 +; CHECK-SVE-NEXT: rdvl x2, #7 +; CHECK-SVE-NEXT: whilelo p0.b, x13, x8 +; CHECK-SVE-NEXT: add x13, x1, x13 +; CHECK-SVE-NEXT: whilelo p1.b, x14, x8 +; CHECK-SVE-NEXT: add x14, x1, x14 +; CHECK-SVE-NEXT: whilelo p2.b, x15, x8 +; CHECK-SVE-NEXT: add x15, x1, x15 +; CHECK-SVE-NEXT: whilelo p3.b, xzr, x8 +; CHECK-SVE-NEXT: .p2align 5, , 16 +; CHECK-SVE-NEXT: .LBB6_2: // %vector.body +; CHECK-SVE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE-NEXT: ld1b { z0.b }, p3/z, [x1, x9] +; CHECK-SVE-NEXT: ld1b { z1.b }, p2/z, [x15, x9] +; CHECK-SVE-NEXT: add x3, x2, x9 +; CHECK-SVE-NEXT: mul z0.b, z0.b, #3 +; CHECK-SVE-NEXT: ld1b { z2.b }, p1/z, [x14, x9] +; CHECK-SVE-NEXT: ld1b { z3.b }, p0/z, [x13, x9] +; CHECK-SVE-NEXT: mul z1.b, z1.b, #3 +; CHECK-SVE-NEXT: mul z2.b, z2.b, #3 +; CHECK-SVE-NEXT: mul z3.b, z3.b, #3 +; CHECK-SVE-NEXT: st1b { z0.b }, p3, [x0, x9] +; CHECK-SVE-NEXT: st1b { z1.b }, p2, [x12, x9] +; CHECK-SVE-NEXT: st1b { z2.b }, p1, [x11, x9] +; CHECK-SVE-NEXT: st1b { z3.b }, p0, [x10, x9] +; CHECK-SVE-NEXT: whilelo p0.b, x3, x8 +; CHECK-SVE-NEXT: add x3, x18, x9 +; CHECK-SVE-NEXT: whilelo p1.b, x3, x8 +; CHECK-SVE-NEXT: add x3, x17, x9 +; CHECK-SVE-NEXT: add x9, x16, x9 +; CHECK-SVE-NEXT: whilelo p2.b, x3, x8 +; CHECK-SVE-NEXT: whilelo p3.b, x9, x8 +; CHECK-SVE-NEXT: b.mi .LBB6_2 +; CHECK-SVE-NEXT: .LBB6_3: // %for.cond.cleanup +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-LABEL: f_b_4: +; CHECK-SVE2p1: // %bb.0: // %entry +; CHECK-SVE2p1-NEXT: cmp w2, #1 +; CHECK-SVE2p1-NEXT: b.lt .LBB6_3 +; CHECK-SVE2p1-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2p1-NEXT: rdvl x10, #1 +; CHECK-SVE2p1-NEXT: mov w8, w2 +; CHECK-SVE2p1-NEXT: mov x9, xzr +; CHECK-SVE2p1-NEXT: rdvl x11, #2 +; CHECK-SVE2p1-NEXT: rdvl x12, #3 +; CHECK-SVE2p1-NEXT: addvl x13, x1, #3 +; CHECK-SVE2p1-NEXT: addvl x14, x1, #2 +; CHECK-SVE2p1-NEXT: addvl x15, x1, #1 +; CHECK-SVE2p1-NEXT: whilelo p3.b, xzr, x8 +; CHECK-SVE2p1-NEXT: whilelo p0.b, x12, x8 +; CHECK-SVE2p1-NEXT: addvl x12, x0, #1 +; CHECK-SVE2p1-NEXT: whilelo p1.b, x11, x8 +; CHECK-SVE2p1-NEXT: addvl x11, x0, #2 +; CHECK-SVE2p1-NEXT: whilelo p2.b, x10, x8 +; CHECK-SVE2p1-NEXT: addvl x10, x0, #3 +; CHECK-SVE2p1-NEXT: .p2align 5, , 16 +; CHECK-SVE2p1-NEXT: .LBB6_2: // %vector.body +; CHECK-SVE2p1-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2p1-NEXT: ld1b { z0.b }, p3/z, [x1, x9] +; CHECK-SVE2p1-NEXT: ld1b { z1.b }, p2/z, [x15, x9] +; CHECK-SVE2p1-NEXT: addvl x16, x9, #5 +; CHECK-SVE2p1-NEXT: addvl x17, x9, #6 +; CHECK-SVE2p1-NEXT: ld1b { z2.b }, p1/z, [x14, x9] +; CHECK-SVE2p1-NEXT: ld1b { z3.b }, p0/z, [x13, x9] +; CHECK-SVE2p1-NEXT: mul z0.b, z0.b, #3 +; CHECK-SVE2p1-NEXT: mul z1.b, z1.b, #3 +; CHECK-SVE2p1-NEXT: mul z2.b, z2.b, #3 +; CHECK-SVE2p1-NEXT: mul z3.b, z3.b, #3 +; CHECK-SVE2p1-NEXT: addvl x18, x9, #7 +; CHECK-SVE2p1-NEXT: st1b { z0.b }, p3, [x0, x9] +; CHECK-SVE2p1-NEXT: st1b { z1.b }, p2, [x12, x9] +; CHECK-SVE2p1-NEXT: st1b { z2.b }, p1, [x11, x9] +; CHECK-SVE2p1-NEXT: st1b { z3.b }, p0, [x10, x9] +; CHECK-SVE2p1-NEXT: addvl x9, x9, #4 +; CHECK-SVE2p1-NEXT: whilelo p0.b, x18, x8 +; CHECK-SVE2p1-NEXT: whilelo p1.b, x17, x8 +; CHECK-SVE2p1-NEXT: whilelo p2.b, x16, x8 +; CHECK-SVE2p1-NEXT: whilelo p3.b, x9, x8 +; CHECK-SVE2p1-NEXT: b.mi .LBB6_2 +; CHECK-SVE2p1-NEXT: .LBB6_3: // %for.cond.cleanup +; CHECK-SVE2p1-NEXT: ret +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + 
+for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 6 + %2 = tail call i64 @llvm.vscale.i64() + %3 = shl nuw nsw i64 %2, 4 + %4 = tail call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 5 + %6 = tail call i64 @llvm.vscale.i64() + %7 = mul nuw nsw i64 %6, 48 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %7, i64 %wide.trip.count) + %active.lane.mask.entry12 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %5, i64 %wide.trip.count) + %active.lane.mask.entry13 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %3, i64 %wide.trip.count) + %active.lane.mask.entry14 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count) + %8 = tail call i64 @llvm.vscale.i64() + %9 = shl nuw nsw i64 %8, 4 + %10 = tail call i64 @llvm.vscale.i64() + %11 = shl nuw nsw i64 %10, 5 + %12 = tail call i64 @llvm.vscale.i64() + %13 = mul nuw nsw i64 %12, 48 + %14 = tail call i64 @llvm.vscale.i64() + %15 = shl nuw nsw i64 %14, 4 + %16 = tail call i64 @llvm.vscale.i64() + %17 = shl nuw nsw i64 %16, 5 + %18 = tail call i64 @llvm.vscale.i64() + %19 = mul nuw nsw i64 %18, 48 + %20 = tail call i64 @llvm.vscale.i64() + %21 = shl nuw nsw i64 %20, 4 + %22 = tail call i64 @llvm.vscale.i64() + %23 = shl nuw nsw i64 %22, 5 + %24 = tail call i64 @llvm.vscale.i64() + %25 = mul nuw nsw i64 %24, 48 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next23, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry14, %for.body.preheader ], [ %active.lane.mask.next26, %vector.body ] + %active.lane.mask15 = phi [ %active.lane.mask.entry13, %for.body.preheader ], [ %active.lane.mask.next25, %vector.body ] + %active.lane.mask16 = phi [ %active.lane.mask.entry12, %for.body.preheader ], [ %active.lane.mask.next24, %vector.body ] + %active.lane.mask17 = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %26 = getelementptr inbounds i8, ptr %src, i64 %index + %27 = getelementptr inbounds i8, ptr %26, i64 %9 + %28 = getelementptr inbounds i8, ptr %26, i64 %11 + %29 = getelementptr inbounds i8, ptr %26, i64 %13 + %wide.masked.load = tail call @llvm.masked.load.nxv16i8.p0(ptr %26, i32 1, %active.lane.mask, poison) + %wide.masked.load18 = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull %27, i32 1, %active.lane.mask15, poison) + %wide.masked.load19 = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull %28, i32 1, %active.lane.mask16, poison) + %wide.masked.load20 = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull %29, i32 1, %active.lane.mask17, poison) + %30 = mul %wide.masked.load, shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) + %31 = mul %wide.masked.load18, shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) + %32 = mul %wide.masked.load19, shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) + %33 = mul %wide.masked.load20, shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) + %34 = getelementptr inbounds i8, ptr %dst, i64 %index + %35 = getelementptr inbounds i8, ptr %34, i64 %15 + %36 = getelementptr inbounds i8, ptr %34, i64 %17 + %37 = getelementptr inbounds i8, ptr %34, i64 %19 + tail call void @llvm.masked.store.nxv16i8.p0( %30, ptr %34, i32 1, %active.lane.mask) + tail call void @llvm.masked.store.nxv16i8.p0( %31, ptr %35, i32 1, %active.lane.mask15) + tail call void 
@llvm.masked.store.nxv16i8.p0( %32, ptr %36, i32 1, %active.lane.mask16) + tail call void @llvm.masked.store.nxv16i8.p0( %33, ptr %37, i32 1, %active.lane.mask17) + %index.next = add i64 %index, %1 + %index.next23 = add i64 %index, %1 + %38 = add i64 %index.next, %21 + %39 = add i64 %index.next, %23 + %40 = add i64 %index.next, %25 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %40, i64 %wide.trip.count) + %active.lane.mask.next24 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %39, i64 %wide.trip.count) + %active.lane.mask.next25 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %38, i64 %wide.trip.count) + %active.lane.mask.next26 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count) + %41 = extractelement %active.lane.mask.next26, i64 0 + br i1 %41, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; Halfword elements, UF=4 +define void @f_h_4(ptr noalias %dst, ptr %src, i32 %n) #0 { +; CHECK-SVE-LABEL: f_h_4: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: cmp w2, #1 +; CHECK-SVE-NEXT: b.lt .LBB7_3 +; CHECK-SVE-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE-NEXT: rdvl x17, #1 +; CHECK-SVE-NEXT: mov w8, w2 +; CHECK-SVE-NEXT: mov x10, xzr +; CHECK-SVE-NEXT: add x14, x0, x17 +; CHECK-SVE-NEXT: whilelo p1.b, x17, x8 +; CHECK-SVE-NEXT: add x17, x1, x17 +; CHECK-SVE-NEXT: rdvl x9, #2 +; CHECK-SVE-NEXT: add x13, x0, x9 +; CHECK-SVE-NEXT: add x16, x1, x9 +; CHECK-SVE-NEXT: rdvl x11, #3 +; CHECK-SVE-NEXT: add x12, x0, x11 +; CHECK-SVE-NEXT: add x15, x1, x11 +; CHECK-SVE-NEXT: punpkhi p0.h, p1.b +; CHECK-SVE-NEXT: punpklo p1.h, p1.b +; CHECK-SVE-NEXT: whilelo p3.b, xzr, x8 +; CHECK-SVE-NEXT: punpkhi p2.h, p3.b +; CHECK-SVE-NEXT: punpklo p3.h, p3.b +; CHECK-SVE-NEXT: .p2align 5, , 16 +; CHECK-SVE-NEXT: .LBB7_2: // %vector.body +; CHECK-SVE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE-NEXT: ld1h { z0.h }, p3/z, [x1, x10, lsl #1] +; CHECK-SVE-NEXT: ld1h { z1.h }, p2/z, [x17, x10, lsl #1] +; CHECK-SVE-NEXT: add x18, x11, x10 +; CHECK-SVE-NEXT: mul z0.h, z0.h, #3 +; CHECK-SVE-NEXT: ld1h { z2.h }, p1/z, [x16, x10, lsl #1] +; CHECK-SVE-NEXT: ld1h { z3.h }, p0/z, [x15, x10, lsl #1] +; CHECK-SVE-NEXT: mul z1.h, z1.h, #3 +; CHECK-SVE-NEXT: mul z2.h, z2.h, #3 +; CHECK-SVE-NEXT: mul z3.h, z3.h, #3 +; CHECK-SVE-NEXT: st1h { z0.h }, p3, [x0, x10, lsl #1] +; CHECK-SVE-NEXT: st1h { z1.h }, p2, [x14, x10, lsl #1] +; CHECK-SVE-NEXT: st1h { z2.h }, p1, [x13, x10, lsl #1] +; CHECK-SVE-NEXT: st1h { z3.h }, p0, [x12, x10, lsl #1] +; CHECK-SVE-NEXT: add x10, x9, x10 +; CHECK-SVE-NEXT: whilelo p1.b, x18, x8 +; CHECK-SVE-NEXT: whilelo p3.b, x10, x8 +; CHECK-SVE-NEXT: punpkhi p0.h, p1.b +; CHECK-SVE-NEXT: punpklo p1.h, p1.b +; CHECK-SVE-NEXT: punpkhi p2.h, p3.b +; CHECK-SVE-NEXT: punpklo p3.h, p3.b +; CHECK-SVE-NEXT: mov z0.h, p3/z, #1 // =0x1 +; CHECK-SVE-NEXT: fmov w18, s0 +; CHECK-SVE-NEXT: tbnz w18, #0, .LBB7_2 +; CHECK-SVE-NEXT: .LBB7_3: // %for.cond.cleanup +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-LABEL: f_h_4: +; CHECK-SVE2p1: // %bb.0: // %entry +; CHECK-SVE2p1-NEXT: cmp w2, #1 +; CHECK-SVE2p1-NEXT: b.lt .LBB7_3 +; CHECK-SVE2p1-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2p1-NEXT: rdvl x10, #1 +; CHECK-SVE2p1-NEXT: mov w8, w2 +; CHECK-SVE2p1-NEXT: mov x9, xzr +; CHECK-SVE2p1-NEXT: whilelo { p2.h, p3.h }, xzr, x8 +; CHECK-SVE2p1-NEXT: addvl x11, x0, #2 +; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x10, x8 +; CHECK-SVE2p1-NEXT: addvl x10, x0, #3 +; CHECK-SVE2p1-NEXT: addvl 
x12, x0, #1 +; CHECK-SVE2p1-NEXT: addvl x13, x1, #3 +; CHECK-SVE2p1-NEXT: addvl x14, x1, #2 +; CHECK-SVE2p1-NEXT: addvl x15, x1, #1 +; CHECK-SVE2p1-NEXT: .p2align 5, , 16 +; CHECK-SVE2p1-NEXT: .LBB7_2: // %vector.body +; CHECK-SVE2p1-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2p1-NEXT: ld1h { z0.h }, p2/z, [x1, x9, lsl #1] +; CHECK-SVE2p1-NEXT: ld1h { z1.h }, p3/z, [x15, x9, lsl #1] +; CHECK-SVE2p1-NEXT: addvl x16, x9, #3 +; CHECK-SVE2p1-NEXT: mul z0.h, z0.h, #3 +; CHECK-SVE2p1-NEXT: ld1h { z2.h }, p0/z, [x14, x9, lsl #1] +; CHECK-SVE2p1-NEXT: ld1h { z3.h }, p1/z, [x13, x9, lsl #1] +; CHECK-SVE2p1-NEXT: mul z1.h, z1.h, #3 +; CHECK-SVE2p1-NEXT: mul z2.h, z2.h, #3 +; CHECK-SVE2p1-NEXT: mul z3.h, z3.h, #3 +; CHECK-SVE2p1-NEXT: st1h { z0.h }, p2, [x0, x9, lsl #1] +; CHECK-SVE2p1-NEXT: st1h { z1.h }, p3, [x12, x9, lsl #1] +; CHECK-SVE2p1-NEXT: st1h { z2.h }, p0, [x11, x9, lsl #1] +; CHECK-SVE2p1-NEXT: st1h { z3.h }, p1, [x10, x9, lsl #1] +; CHECK-SVE2p1-NEXT: addvl x9, x9, #2 +; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x16, x8 +; CHECK-SVE2p1-NEXT: whilelo { p2.h, p3.h }, x9, x8 +; CHECK-SVE2p1-NEXT: mov z0.h, p2/z, #1 // =0x1 +; CHECK-SVE2p1-NEXT: fmov w16, s0 +; CHECK-SVE2p1-NEXT: tbnz w16, #0, .LBB7_2 +; CHECK-SVE2p1-NEXT: .LBB7_3: // %for.cond.cleanup +; CHECK-SVE2p1-NEXT: ret +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 5 + %2 = tail call i64 @llvm.vscale.i64() + %3 = shl nuw nsw i64 %2, 4 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %3, i64 %wide.trip.count) + %active.lane.mask.entry12 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count) + %active.lane.mask.entry13 = tail call @llvm.vector.extract.nxv8i1.nxv16i1( %active.lane.mask.entry, i64 8) + %active.lane.mask.entry14 = tail call @llvm.vector.extract.nxv8i1.nxv16i1( %active.lane.mask.entry, i64 0) + %active.lane.mask.entry15 = tail call @llvm.vector.extract.nxv8i1.nxv16i1( %active.lane.mask.entry12, i64 8) + %active.lane.mask.entry16 = tail call @llvm.vector.extract.nxv8i1.nxv16i1( %active.lane.mask.entry12, i64 0) + %4 = tail call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = tail call i64 @llvm.vscale.i64() + %7 = shl nuw nsw i64 %6, 4 + %8 = tail call i64 @llvm.vscale.i64() + %9 = mul nuw nsw i64 %8, 24 + %10 = tail call i64 @llvm.vscale.i64() + %11 = shl nuw nsw i64 %10, 3 + %12 = tail call i64 @llvm.vscale.i64() + %13 = shl nuw nsw i64 %12, 4 + %14 = tail call i64 @llvm.vscale.i64() + %15 = mul nuw nsw i64 %14, 24 + %16 = tail call i64 @llvm.vscale.i64() + %17 = shl nuw nsw i64 %16, 4 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next25, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry16, %for.body.preheader ], [ %active.lane.mask.next30, %vector.body ] + %active.lane.mask17 = phi [ %active.lane.mask.entry15, %for.body.preheader ], [ %active.lane.mask.next29, %vector.body ] + %active.lane.mask18 = phi [ %active.lane.mask.entry14, %for.body.preheader ], [ %active.lane.mask.next28, %vector.body ] + %active.lane.mask19 = phi [ %active.lane.mask.entry13, %for.body.preheader ], [ %active.lane.mask.next27, %vector.body ] + %18 = getelementptr inbounds i16, ptr %src, i64 %index + %19 = getelementptr inbounds i16, ptr %18, i64 %5 + %20 = getelementptr inbounds i16, ptr %18, i64 %7 + %21 = 
getelementptr inbounds i16, ptr %18, i64 %9 + %wide.masked.load = tail call @llvm.masked.load.nxv8i16.p0(ptr %18, i32 2, %active.lane.mask, poison) + %wide.masked.load20 = tail call @llvm.masked.load.nxv8i16.p0(ptr nonnull %19, i32 2, %active.lane.mask17, poison) + %wide.masked.load21 = tail call @llvm.masked.load.nxv8i16.p0(ptr nonnull %20, i32 2, %active.lane.mask18, poison) + %wide.masked.load22 = tail call @llvm.masked.load.nxv8i16.p0(ptr nonnull %21, i32 2, %active.lane.mask19, poison) + %22 = mul %wide.masked.load, shufflevector ( insertelement ( poison, i16 3, i64 0), poison, zeroinitializer) + %23 = mul %wide.masked.load20, shufflevector ( insertelement ( poison, i16 3, i64 0), poison, zeroinitializer) + %24 = mul %wide.masked.load21, shufflevector ( insertelement ( poison, i16 3, i64 0), poison, zeroinitializer) + %25 = mul %wide.masked.load22, shufflevector ( insertelement ( poison, i16 3, i64 0), poison, zeroinitializer) + %26 = getelementptr inbounds i16, ptr %dst, i64 %index + %27 = getelementptr inbounds i16, ptr %26, i64 %11 + %28 = getelementptr inbounds i16, ptr %26, i64 %13 + %29 = getelementptr inbounds i16, ptr %26, i64 %15 + tail call void @llvm.masked.store.nxv8i16.p0( %22, ptr %26, i32 2, %active.lane.mask) + tail call void @llvm.masked.store.nxv8i16.p0( %23, ptr %27, i32 2, %active.lane.mask17) + tail call void @llvm.masked.store.nxv8i16.p0( %24, ptr %28, i32 2, %active.lane.mask18) + tail call void @llvm.masked.store.nxv8i16.p0( %25, ptr %29, i32 2, %active.lane.mask19) + %index.next = add i64 %index, %1 + %index.next25 = add i64 %index, %1 + %30 = add i64 %index.next, %17 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %30, i64 %wide.trip.count) + %active.lane.mask.next26 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count) + %active.lane.mask.next27 = tail call @llvm.vector.extract.nxv8i1.nxv16i1( %active.lane.mask.next, i64 8) + %active.lane.mask.next28 = tail call @llvm.vector.extract.nxv8i1.nxv16i1( %active.lane.mask.next, i64 0) + %active.lane.mask.next29 = tail call @llvm.vector.extract.nxv8i1.nxv16i1( %active.lane.mask.next26, i64 8) + %active.lane.mask.next30 = tail call @llvm.vector.extract.nxv8i1.nxv16i1( %active.lane.mask.next26, i64 0) + %31 = extractelement %active.lane.mask.next30, i64 0 + br i1 %31, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + + +; Double-precision float elements, UF=4 +define void @f_d_4(ptr noalias %dst, ptr readonly %src, i32 %n) #0 { +; CHECK-SVE-LABEL: f_d_4: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: cmp w2, #1 +; CHECK-SVE-NEXT: b.lt .LBB8_3 +; CHECK-SVE-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE-NEXT: cntw x10 +; CHECK-SVE-NEXT: mov w8, w2 +; CHECK-SVE-NEXT: fmov z0.d, #3.00000000 +; CHECK-SVE-NEXT: mov x9, xzr +; CHECK-SVE-NEXT: rdvl x13, #3 +; CHECK-SVE-NEXT: rdvl x14, #2 +; CHECK-SVE-NEXT: add x11, x0, x14 +; CHECK-SVE-NEXT: add x14, x1, x14 +; CHECK-SVE-NEXT: rdvl x15, #1 +; CHECK-SVE-NEXT: add x12, x0, x15 +; CHECK-SVE-NEXT: add x15, x1, x15 +; CHECK-SVE-NEXT: cnth x16 +; CHECK-SVE-NEXT: cntw x17, all, mul #3 +; CHECK-SVE-NEXT: whilelo p1.s, x10, x8 +; CHECK-SVE-NEXT: add x10, x0, x13 +; CHECK-SVE-NEXT: add x13, x1, x13 +; CHECK-SVE-NEXT: punpkhi p0.h, p1.b +; CHECK-SVE-NEXT: punpklo p1.h, p1.b +; CHECK-SVE-NEXT: whilelo p3.s, xzr, x8 +; CHECK-SVE-NEXT: punpkhi p2.h, p3.b +; CHECK-SVE-NEXT: punpklo p3.h, p3.b +; CHECK-SVE-NEXT: .p2align 5, , 16 +; CHECK-SVE-NEXT: .LBB8_2: // %vector.body +; 
CHECK-SVE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE-NEXT: ld1d { z1.d }, p3/z, [x1, x9, lsl #3] +; CHECK-SVE-NEXT: ld1d { z2.d }, p2/z, [x15, x9, lsl #3] +; CHECK-SVE-NEXT: add x18, x17, x9 +; CHECK-SVE-NEXT: fmul z1.d, z1.d, z0.d +; CHECK-SVE-NEXT: ld1d { z3.d }, p1/z, [x14, x9, lsl #3] +; CHECK-SVE-NEXT: ld1d { z4.d }, p0/z, [x13, x9, lsl #3] +; CHECK-SVE-NEXT: fmul z2.d, z2.d, z0.d +; CHECK-SVE-NEXT: fmul z3.d, z3.d, z0.d +; CHECK-SVE-NEXT: fmul z4.d, z4.d, z0.d +; CHECK-SVE-NEXT: st1d { z1.d }, p3, [x0, x9, lsl #3] +; CHECK-SVE-NEXT: st1d { z2.d }, p2, [x12, x9, lsl #3] +; CHECK-SVE-NEXT: st1d { z3.d }, p1, [x11, x9, lsl #3] +; CHECK-SVE-NEXT: st1d { z4.d }, p0, [x10, x9, lsl #3] +; CHECK-SVE-NEXT: add x9, x16, x9 +; CHECK-SVE-NEXT: whilelo p1.s, x18, x8 +; CHECK-SVE-NEXT: whilelo p3.s, x9, x8 +; CHECK-SVE-NEXT: punpkhi p0.h, p1.b +; CHECK-SVE-NEXT: punpklo p1.h, p1.b +; CHECK-SVE-NEXT: punpkhi p2.h, p3.b +; CHECK-SVE-NEXT: punpklo p3.h, p3.b +; CHECK-SVE-NEXT: mov z1.d, p3/z, #1 // =0x1 +; CHECK-SVE-NEXT: fmov x18, d1 +; CHECK-SVE-NEXT: tbnz w18, #0, .LBB8_2 +; CHECK-SVE-NEXT: .LBB8_3: // %for.cond.cleanup +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-LABEL: f_d_4: +; CHECK-SVE2p1: // %bb.0: // %entry +; CHECK-SVE2p1-NEXT: cmp w2, #1 +; CHECK-SVE2p1-NEXT: b.lt .LBB8_3 +; CHECK-SVE2p1-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2p1-NEXT: cntw x10 +; CHECK-SVE2p1-NEXT: mov w8, w2 +; CHECK-SVE2p1-NEXT: fmov z0.d, #3.00000000 +; CHECK-SVE2p1-NEXT: mov x9, xzr +; CHECK-SVE2p1-NEXT: mov x15, xzr +; CHECK-SVE2p1-NEXT: whilelo { p2.d, p3.d }, xzr, x8 +; CHECK-SVE2p1-NEXT: addvl x11, x0, #2 +; CHECK-SVE2p1-NEXT: whilelo { p0.d, p1.d }, x10, x8 +; CHECK-SVE2p1-NEXT: addvl x10, x0, #3 +; CHECK-SVE2p1-NEXT: addvl x12, x0, #1 +; CHECK-SVE2p1-NEXT: addvl x13, x1, #3 +; CHECK-SVE2p1-NEXT: addvl x14, x1, #2 +; CHECK-SVE2p1-NEXT: addvl x16, x1, #1 +; CHECK-SVE2p1-NEXT: .p2align 5, , 16 +; CHECK-SVE2p1-NEXT: .LBB8_2: // %vector.body +; CHECK-SVE2p1-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2p1-NEXT: ld1d { z1.d }, p2/z, [x1, x9, lsl #3] +; CHECK-SVE2p1-NEXT: ld1d { z2.d }, p3/z, [x16, x9, lsl #3] +; CHECK-SVE2p1-NEXT: inch x15 +; CHECK-SVE2p1-NEXT: fmul z1.d, z1.d, z0.d +; CHECK-SVE2p1-NEXT: ld1d { z3.d }, p0/z, [x14, x9, lsl #3] +; CHECK-SVE2p1-NEXT: ld1d { z4.d }, p1/z, [x13, x9, lsl #3] +; CHECK-SVE2p1-NEXT: fmul z2.d, z2.d, z0.d +; CHECK-SVE2p1-NEXT: fmul z3.d, z3.d, z0.d +; CHECK-SVE2p1-NEXT: fmul z4.d, z4.d, z0.d +; CHECK-SVE2p1-NEXT: st1d { z1.d }, p2, [x0, x9, lsl #3] +; CHECK-SVE2p1-NEXT: st1d { z2.d }, p3, [x12, x9, lsl #3] +; CHECK-SVE2p1-NEXT: whilelo { p2.d, p3.d }, x15, x8 +; CHECK-SVE2p1-NEXT: mov z1.d, p2/z, #1 // =0x1 +; CHECK-SVE2p1-NEXT: st1d { z3.d }, p0, [x11, x9, lsl #3] +; CHECK-SVE2p1-NEXT: st1d { z4.d }, p1, [x10, x9, lsl #3] +; CHECK-SVE2p1-NEXT: incw x9, all, mul #3 +; CHECK-SVE2p1-NEXT: whilelo { p0.d, p1.d }, x9, x8 +; CHECK-SVE2p1-NEXT: mov x9, x15 +; CHECK-SVE2p1-NEXT: fmov x17, d1 +; CHECK-SVE2p1-NEXT: tbnz w17, #0, .LBB8_2 +; CHECK-SVE2p1-NEXT: .LBB8_3: // %for.cond.cleanup +; CHECK-SVE2p1-NEXT: ret +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 3 + %2 = tail call i64 @llvm.vscale.i64() + %3 = shl nuw nsw i64 %2, 2 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %3, i64 %wide.trip.count) + %active.lane.mask.entry3 = tail 
call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count) + %active.lane.mask.entry4 = tail call @llvm.vector.extract.nxv2i1.nxv4i1( %active.lane.mask.entry, i64 2) + %active.lane.mask.entry5 = tail call @llvm.vector.extract.nxv2i1.nxv4i1( %active.lane.mask.entry, i64 0) + %active.lane.mask.entry6 = tail call @llvm.vector.extract.nxv2i1.nxv4i1( %active.lane.mask.entry3, i64 2) + %active.lane.mask.entry7 = tail call @llvm.vector.extract.nxv2i1.nxv4i1( %active.lane.mask.entry3, i64 0) + %4 = tail call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 1 + %6 = tail call i64 @llvm.vscale.i64() + %7 = shl nuw nsw i64 %6, 2 + %8 = tail call i64 @llvm.vscale.i64() + %9 = mul nuw nsw i64 %8, 6 + %10 = tail call i64 @llvm.vscale.i64() + %11 = shl nuw nsw i64 %10, 1 + %12 = tail call i64 @llvm.vscale.i64() + %13 = shl nuw nsw i64 %12, 2 + %14 = tail call i64 @llvm.vscale.i64() + %15 = mul nuw nsw i64 %14, 6 + %16 = tail call i64 @llvm.vscale.i64() + %17 = shl nuw nsw i64 %16, 2 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next16, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry7, %for.body.preheader ], [ %active.lane.mask.next21, %vector.body ] + %active.lane.mask8 = phi [ %active.lane.mask.entry6, %for.body.preheader ], [ %active.lane.mask.next20, %vector.body ] + %active.lane.mask9 = phi [ %active.lane.mask.entry5, %for.body.preheader ], [ %active.lane.mask.next19, %vector.body ] + %active.lane.mask10 = phi [ %active.lane.mask.entry4, %for.body.preheader ], [ %active.lane.mask.next18, %vector.body ] + %18 = getelementptr inbounds double, ptr %src, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv2f64.p0(ptr %18, i32 8, %active.lane.mask, poison) + %19 = getelementptr inbounds double, ptr %18, i64 %5 + %wide.masked.load11 = tail call @llvm.masked.load.nxv2f64.p0(ptr nonnull %19, i32 8, %active.lane.mask8, poison) + %20 = getelementptr inbounds double, ptr %18, i64 %7 + %wide.masked.load12 = tail call @llvm.masked.load.nxv2f64.p0(ptr nonnull %20, i32 8, %active.lane.mask9, poison) + %21 = getelementptr inbounds double, ptr %18, i64 %9 + %wide.masked.load13 = tail call @llvm.masked.load.nxv2f64.p0(ptr nonnull %21, i32 8, %active.lane.mask10, poison) + %22 = fmul %wide.masked.load, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) + %23 = fmul %wide.masked.load11, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) + %24 = fmul %wide.masked.load12, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) + %25 = fmul %wide.masked.load13, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) + %26 = getelementptr inbounds double, ptr %dst, i64 %index + tail call void @llvm.masked.store.nxv2f64.p0( %22, ptr %26, i32 8, %active.lane.mask) + %27 = getelementptr inbounds double, ptr %26, i64 %11 + tail call void @llvm.masked.store.nxv2f64.p0( %23, ptr %27, i32 8, %active.lane.mask8) + %28 = getelementptr inbounds double, ptr %26, i64 %13 + tail call void @llvm.masked.store.nxv2f64.p0( %24, ptr %28, i32 8, %active.lane.mask9) + %29 = getelementptr inbounds double, ptr %26, i64 %15 + tail call void @llvm.masked.store.nxv2f64.p0( %25, ptr %29, i32 8, %active.lane.mask10) + %index.next = add i64 %index, %1 + %index.next16 = add i64 %index, %1 + %30 = add i64 %index.next, %17 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 
%30, i64 %wide.trip.count) + %active.lane.mask.next17 = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count) + %active.lane.mask.next18 = tail call @llvm.vector.extract.nxv2i1.nxv4i1( %active.lane.mask.next, i64 2) + %active.lane.mask.next19 = tail call @llvm.vector.extract.nxv2i1.nxv4i1( %active.lane.mask.next, i64 0) + %active.lane.mask.next20 = tail call @llvm.vector.extract.nxv2i1.nxv4i1( %active.lane.mask.next17, i64 2) + %active.lane.mask.next21 = tail call @llvm.vector.extract.nxv2i1.nxv4i1( %active.lane.mask.next17, i64 0) + %31 = extractelement %active.lane.mask.next21, i64 0 + br i1 %31, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +declare @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64) +declare @llvm.vector.extract.nxv16i1.nxv32i1(, i64 immarg) +declare @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, , ) +declare @llvm.masked.load.nxv2f64.p0(ptr nocapture, i32 immarg, , ) +declare @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) +declare @llvm.vector.extract.nxv2i1.nxv4i1(, i64 immarg) +declare @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64) +declare @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64) +declare i64 @llvm.vscale.i64() +declare void @llvm.masked.store.nxv16i8.p0(, ptr nocapture, i32 immarg, ) +declare void @llvm.masked.store.nxv2f64.p0(, ptr nocapture, i32 immarg, ) + +attributes #0 = { nounwind vscale_range(1,16) "target-cpu"="neoverse-v1" } + diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index 200c2adcf0e66..a36fce5f8f690 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -93,38 +93,35 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) ; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; TFA_INTERLEAVE: vector.body: ; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = 
getelementptr i64, ptr [[TMP9]], i64 [[TMP11]] -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK2]], poison) -; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[ACTIVE_LANE_MASK2]]) -; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i64 [[TMP17]] -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP13]], ptr [[TMP15]], i32 8, [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP14]], ptr [[TMP18]], i32 8, [[ACTIVE_LANE_MASK2]]) -; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]] -; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX_NEXT]], [[TMP20]] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP21]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0 -; TFA_INTERLEAVE-NEXT: br i1 [[TMP23]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY2]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP7]], i64 [[TMP9]] +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK3]], poison) +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD4]], [[ACTIVE_LANE_MASK3]]) +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i64 [[TMP15]] +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP11]], ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( 
[[TMP12]], ptr [[TMP16]], i32 8, [[ACTIVE_LANE_MASK3]]) +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 +; TFA_INTERLEAVE-NEXT: br i1 [[TMP18]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TFA_INTERLEAVE: for.cond.cleanup: ; TFA_INTERLEAVE-NEXT: ret void ; @@ -224,13 +221,13 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[TMP11:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; TFCOMMON-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer ; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], zeroinitializer, [[TMP10]] -; TFCOMMON-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP14]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFCOMMON-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) -; TFCOMMON-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFCOMMON-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 -; TFCOMMON-NEXT: br i1 [[TMP16]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; TFCOMMON-NEXT: [[TMP14:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFCOMMON-NEXT: [[TMP15:%.*]] = extractelement [[TMP14]], i32 0 +; TFCOMMON-NEXT: br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TFCOMMON: for.cond.cleanup: ; TFCOMMON-NEXT: ret void ; @@ -246,48 +243,45 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; 
TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) ; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; TFA_INTERLEAVE: vector.body: ; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 [[TMP11]] -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK2]], poison) -; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = icmp ugt [[WIDE_MASKED_LOAD3]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP14]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP15]]) -; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[TMP16]]) -; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = xor [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP19]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP20]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP21]], zeroinitializer, [[TMP17]] -; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP22]], zeroinitializer, [[TMP18]] -; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP25]], i64 [[TMP27]] -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP25]], i32 8, [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI4]], ptr [[TMP28]], i32 8, [[ACTIVE_LANE_MASK2]]) -; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]] -; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = add i64 [[INDEX_NEXT]], [[TMP30]] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) -; TFA_INTERLEAVE-NEXT: 
[[ACTIVE_LANE_MASK_NEXT6]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP31]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[TMP32:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement [[TMP32]], i32 0 -; TFA_INTERLEAVE-NEXT: br i1 [[TMP33]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY2]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 [[TMP9]] +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK3]], poison) +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = icmp ugt [[WIDE_MASKED_LOAD4]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK3]], [[TMP12]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP13]]) +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD4]], [[TMP14]]) +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = xor [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK3]], [[TMP18]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP19]], zeroinitializer, [[TMP15]] +; TFA_INTERLEAVE-NEXT: [[PREDPHI5:%.*]] = select [[TMP20]], zeroinitializer, [[TMP16]] +; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP23]] +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP21]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI5]], ptr [[TMP24]], i32 8, [[ACTIVE_LANE_MASK3]]) +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], 
i64 2) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT7]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement [[TMP25]], i32 0 +; TFA_INTERLEAVE-NEXT: br i1 [[TMP26]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TFA_INTERLEAVE: for.cond.cleanup: ; TFA_INTERLEAVE-NEXT: ret void ; @@ -403,13 +397,13 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP8]], zeroinitializer ; TFCOMMON-NEXT: [[TMP13:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP12]]) ; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], [[TMP11]], [[TMP13]] -; TFCOMMON-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP15]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFCOMMON-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP14]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) -; TFCOMMON-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFCOMMON-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 -; TFCOMMON-NEXT: br i1 [[TMP17]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TFCOMMON-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFCOMMON-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 +; TFCOMMON-NEXT: br i1 [[TMP16]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TFCOMMON: for.cond.cleanup: ; TFCOMMON-NEXT: ret void ; @@ -425,50 +419,47 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) ; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; TFA_INTERLEAVE: vector.body: ; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ 
[[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 [[TMP11]] -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK2]], poison) -; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = icmp ugt [[WIDE_MASKED_LOAD3]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = xor [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP16]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = call @foo_vector( zeroinitializer, [[TMP17]]) -; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = call @foo_vector( zeroinitializer, [[TMP18]]) -; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP14]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP21]]) -; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[TMP22]]) -; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP17]], [[TMP19]], [[TMP23]] -; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP18]], [[TMP20]], [[TMP24]] -; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i64 [[TMP29]] -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP27]], i32 8, [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI4]], ptr [[TMP30]], i32 8, [[ACTIVE_LANE_MASK2]]) -; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]] -; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP33:%.*]] = add i64 [[INDEX_NEXT]], [[TMP32]] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP33]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[TMP34:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: 
[[TMP35:%.*]] = extractelement [[TMP34]], i32 0 -; TFA_INTERLEAVE-NEXT: br i1 [[TMP35]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY2]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 [[TMP9]] +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK3]], poison) +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = icmp ugt [[WIDE_MASKED_LOAD4]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = xor [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK3]], [[TMP14]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = call @foo_vector( zeroinitializer, [[TMP15]]) +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call @foo_vector( zeroinitializer, [[TMP16]]) +; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK3]], [[TMP12]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP19]]) +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD4]], [[TMP20]]) +; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP17]], [[TMP21]] +; TFA_INTERLEAVE-NEXT: [[PREDPHI5:%.*]] = select [[TMP16]], [[TMP18]], [[TMP22]] +; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[TMP23]], i64 [[TMP25]] +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP23]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI5]], ptr [[TMP26]], i32 8, [[ACTIVE_LANE_MASK3]]) +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT7]] = call @llvm.vector.extract.nxv2i1.nxv4i1( 
[[ACTIVE_LANE_MASK_NEXT]], i64 0) +; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement [[TMP27]], i32 0 +; TFA_INTERLEAVE-NEXT: br i1 [[TMP28]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TFA_INTERLEAVE: for.cond.cleanup: ; TFA_INTERLEAVE-NEXT: ret void ; @@ -756,38 +747,35 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) ; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; TFA_INTERLEAVE: vector.body: ; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP9]], i64 [[TMP11]] -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK2]], poison) -; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[ACTIVE_LANE_MASK2]]) -; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i64 [[TMP17]] -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP13]], ptr [[TMP15]], i32 8, [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP14]], ptr [[TMP18]], i32 8, [[ACTIVE_LANE_MASK2]]) -; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]] -; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 -; 
TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX_NEXT]], [[TMP20]] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP21]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0 -; TFA_INTERLEAVE-NEXT: br i1 [[TMP23]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY2]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP7]], i64 [[TMP9]] +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK3]], poison) +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD4]], [[ACTIVE_LANE_MASK3]]) +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i64 [[TMP15]] +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP11]], ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP12]], ptr [[TMP16]], i32 8, [[ACTIVE_LANE_MASK3]]) +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 +; TFA_INTERLEAVE-NEXT: br i1 [[TMP18]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; TFA_INTERLEAVE: for.cond.cleanup: ; TFA_INTERLEAVE-NEXT: ret void ; @@ -952,51 +940,48 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 
@llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) ; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[M:%.*]], i64 0 ; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; TFA_INTERLEAVE: vector.body: ; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]] -; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP9]], i64 [[TMP11]] -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK2]], poison) -; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] -; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fmul [[WIDE_MASKED_LOAD3]], [[BROADCAST_SPLAT]] -; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = fptoui [[WIDE_MASKED_LOAD]] to -; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = fptoui [[WIDE_MASKED_LOAD3]] to -; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = call @foo_vector( [[TMP15]], [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call @foo_vector( [[TMP16]], [[ACTIVE_LANE_MASK2]]) -; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i64 [[TMP21]] -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP17]], ptr [[TMP19]], i32 8, [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP18]], ptr [[TMP22]], i32 8, [[ACTIVE_LANE_MASK2]]) -; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP23]]) -; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP14]], 
shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP26]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[TMP24]], [[TMP25]]) -; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]] -; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = add i64 [[INDEX_NEXT]], [[TMP28]] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP29]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i32 0 -; TFA_INTERLEAVE-NEXT: br i1 [[TMP31]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY2]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[TMP7]], i64 [[TMP9]] +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK3]], poison) +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = fmul [[WIDE_MASKED_LOAD4]], [[BROADCAST_SPLAT]] +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = fptoui [[WIDE_MASKED_LOAD]] to +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fptoui [[WIDE_MASKED_LOAD4]] to +; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = call @foo_vector( [[TMP13]], [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = call @foo_vector( [[TMP14]], [[ACTIVE_LANE_MASK3]]) +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i64 [[TMP19]] +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP15]], ptr [[TMP17]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP16]], ptr [[TMP20]], i32 8, [[ACTIVE_LANE_MASK3]]) +; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP21]]) +; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = select [[ACTIVE_LANE_MASK3]], [[TMP12]], shufflevector ( insertelement ( poison, 
double -0.000000e+00, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP24]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[TMP22]], [[TMP23]]) +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement [[TMP25]], i32 0 +; TFA_INTERLEAVE-NEXT: br i1 [[TMP26]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TFA_INTERLEAVE: for.cond.cleanup: -; TFA_INTERLEAVE-NEXT: ret double [[TMP26]] +; TFA_INTERLEAVE-NEXT: ret double [[TMP24]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll index 809d2e8f7ea1a..34283ea296712 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll @@ -16,8 +16,8 @@ define i32 @pr70988() { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], 1 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = icmp ult i64 0, [[UMAX]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[UMAX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = icmp ult i64 0, [[UMAX]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE5:%.*]] ] @@ -51,8 +51,8 @@ define i32 @pr70988() { ; CHECK-NEXT: [[TMP18]] = select i1 [[ACTIVE_LANE_MASK2]], i32 [[TMP16]], i32 [[VEC_PHI3]] ; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX_NEXT]], 1 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX_NEXT]], [[UMAX]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT7]] = icmp ult i64 [[TMP19]], [[UMAX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX_NEXT]], [[UMAX]] ; CHECK-NEXT: [[TMP20:%.*]] = xor i1 [[ACTIVE_LANE_MASK_NEXT]], true ; CHECK-NEXT: [[TMP21:%.*]] = xor i1 [[ACTIVE_LANE_MASK_NEXT7]], true ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index bcf8096f1b738..0ae81efdf7d70 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -42,22 +42,22 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call 
i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP7]] = fadd [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9]] = fadd [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[TMP7]]) +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[TMP9]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-UNORDERED: scalar.ph: @@ -89,18 +89,18 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; CHECK-ORDERED-NEXT: [[TMP7]] = call float 
@llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[WIDE_LOAD]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -108,7 +108,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -120,7 +120,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] ; ; CHECK-ORDERED-TF-LABEL: define float @fadd_strict @@ -136,27 +136,27 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call 
@llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP13]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP12]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP16]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP15]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP11]]) ; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -164,7 +164,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -176,7 +176,7 @@ 
define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]] ; @@ -230,60 +230,60 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 32 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] -; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 -; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] -; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, 
ptr [[A]], i64 [[TMP19]] -; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 -; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP27]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP30]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP33]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP34]] = fadd [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-UNORDERED-NEXT: [[TMP35]] = fadd [[WIDE_LOAD4]], [[VEC_PHI1]] -; CHECK-UNORDERED-NEXT: [[TMP36]] = fadd [[WIDE_LOAD5]], [[VEC_PHI2]] -; CHECK-UNORDERED-NEXT: [[TMP37]] = fadd [[WIDE_LOAD6]], [[VEC_PHI3]] -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP39]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0 +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1 +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 24 +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0 +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1 +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-UNORDERED-NEXT: 
[[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]] +; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]] +; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 24 +; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP26]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP29]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP32]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP35]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP36]] = fadd [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[TMP37]] = fadd [[WIDE_LOAD4]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP38]] = fadd [[WIDE_LOAD5]], [[VEC_PHI2]] +; CHECK-UNORDERED-NEXT: [[TMP39]] = fadd [[WIDE_LOAD6]], [[VEC_PHI3]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd [[TMP35]], [[TMP34]] -; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd [[TMP36]], [[BIN_RDX]] -; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd [[TMP37]], [[BIN_RDX7]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd [[TMP37]], [[TMP36]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd [[TMP38]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd [[TMP39]], [[BIN_RDX7]] ; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[BIN_RDX8]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -316,51 +316,51 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 32 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: 
[[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] -; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 -; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] -; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 -; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP30]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP33]], align 4 -; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[WIDE_LOAD]]) -; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP34]], [[WIDE_LOAD1]]) -; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP35]], [[WIDE_LOAD2]]) -; CHECK-ORDERED-NEXT: [[TMP37]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP36]], [[WIDE_LOAD3]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP39]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0 +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1 
+; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 24 +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0 +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1 +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]] +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]] +; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 24 +; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP26]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP29]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP32]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP35]], align 4 +; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP36]], [[WIDE_LOAD1]]) +; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP37]], [[WIDE_LOAD2]]) +; CHECK-ORDERED-NEXT: [[TMP39]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP38]], [[WIDE_LOAD3]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -368,7 +368,7 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -380,7 +380,7 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label 
[[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] ; ; CHECK-ORDERED-TF-LABEL: define float @fadd_strict_unroll @@ -396,113 +396,117 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 32 ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP13]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 16 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP15]] +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 24 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] 
= call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 8) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_ENTRY3]], i64 8) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_ENTRY3]], i64 0) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP68:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]] -; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY6]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP55:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = 
mul i64 [[TMP24]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = add i64 [[TMP25]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], [[TMP32]] +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP18]] +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP28]] +; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP33]] +; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] ; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] -; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; 
CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP61]]) -; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = select [[ACTIVE_LANE_MASK6]], [[WIDE_MASKED_LOAD9]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP62]], [[TMP63]]) -; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = select [[ACTIVE_LANE_MASK7]], [[WIDE_MASKED_LOAD10]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP64]], [[TMP65]]) -; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = select [[ACTIVE_LANE_MASK8]], [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP68]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP66]], [[TMP67]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP70]] -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = add i64 [[INDEX]], [[TMP72]] -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], [[TMP75]] -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = add i64 [[INDEX]], [[TMP78]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP73]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP76]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP79]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = extractelement [[TMP80]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP38]], i32 4, 
[[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP41]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP44]], i32 4, [[ACTIVE_LANE_MASK9]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP47]], i32 4, [[ACTIVE_LANE_MASK10]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP48]]) +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = select [[ACTIVE_LANE_MASK8]], [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP49]], [[TMP50]]) +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = select [[ACTIVE_LANE_MASK9]], [[WIDE_MASKED_LOAD12]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP51]], [[TMP52]]) +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = select [[ACTIVE_LANE_MASK10]], [[WIDE_MASKED_LOAD13]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP55]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP53]], [[TMP54]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = add i64 [[INDEX]], [[TMP57]] +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = mul i64 [[TMP59]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], [[TMP60]] +; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = mul i64 [[TMP62]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = add i64 [[INDEX]], [[TMP63]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP61]], i64 [[TMP11]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_NEXT]], i64 8) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_NEXT14]], i64 8) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_NEXT14]], i64 0) +; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector 
( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = extractelement [[TMP65]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP69]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP85]], [[SUM_07]] +; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP70]], [[SUM_07]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]] ; @@ -574,37 +578,37 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-UNORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2 -; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[A2]], i32 0 -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[A1]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[A2]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[A1]], i32 0 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; CHECK-UNORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]] +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP13]], align 4 ; CHECK-UNORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8f32( [[WIDE_VEC]]) -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-UNORDERED-NEXT: [[TMP14]] = fadd [[TMP12]], [[VEC_PHI1]] -; CHECK-UNORDERED-NEXT: [[TMP15]] = fadd [[TMP13]], [[VEC_PHI]] -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-UNORDERED-NEXT: [[TMP16]] = fadd [[TMP14]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP17]] = fadd [[TMP15]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP15]]) -; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP14]]) +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP17]]) +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP16]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-UNORDERED: scalar.ph: ; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] -; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] ; 
CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-UNORDERED: for.body: ; CHECK-UNORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] @@ -621,8 +625,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-UNORDERED: for.end: -; CHECK-UNORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] -; CHECK-UNORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; CHECK-UNORDERED-NEXT: store float [[ADD1_LCSSA]], ptr [[A]], align 4 ; CHECK-UNORDERED-NEXT: store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4 ; CHECK-UNORDERED-NEXT: ret void @@ -646,24 +650,24 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-ORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2 -; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP11]], align 4 ; CHECK-ORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8f32( [[WIDE_VEC]]) -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-ORDERED-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP11]]) -; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP10]]) -; 
CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-ORDERED-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) +; CHECK-ORDERED-NEXT: [[TMP15]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP12]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -671,8 +675,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] @@ -689,8 +693,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: store float [[ADD1_LCSSA]], ptr [[A]], align 4 ; CHECK-ORDERED-NEXT: store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4 ; CHECK-ORDERED-NEXT: ret void @@ -715,35 +719,35 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2 -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]] -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]] -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: 
[[TMP11:%.*]] = mul i64 [[TMP10]], 4 +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = sub i64 [[TMP2]], [[TMP11]] +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP2]], [[TMP11]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP15]] +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK]]) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP17]], i32 4, [[INTERLEAVED_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8f32( [[WIDE_MASKED_VEC]]) -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP18]]) -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP16]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]]) +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP19]], shufflevector ( insertelement ( poison, float -0.000000e+00, 
i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP20]]) +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP23]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP22]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]]) ; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -751,8 +755,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] @@ -769,8 +773,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: store float [[ADD1_LCSSA]], ptr [[A]], align 4 ; CHECK-ORDERED-TF-NEXT: store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4 ; CHECK-ORDERED-TF-NEXT: ret void @@ -852,26 +856,26 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-UNORDERED-NEXT: br label 
[[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] ; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-UNORDERED-NEXT: [[TMP11]] = fadd [[VEC_PHI]], [[TMP10]] -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd [[VEC_PHI]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP11]]) +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP13]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-UNORDERED: scalar.ph: @@ -914,22 +918,22 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: 
[[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 4 -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] ; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 4 -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-ORDERED-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP10]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP12]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -937,7 +941,7 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -952,7 +956,7 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-ORDERED: for.end.loopexit: -; CHECK-ORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_END]] ; CHECK-ORDERED: for.end: ; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] @@ -976,31 +980,31 @@ 
define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP5]] ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4 ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[TMP7]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], [[TMP7]] -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[TMP9]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[N]], [[TMP9]] +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] ; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP16]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP18]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP17]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]]) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] +; 
CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP20]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP19]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]]) ; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = extractelement [[TMP21]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -1008,7 +1012,7 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -1023,7 +1027,7 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-ORDERED-TF: for.end.loopexit: -; CHECK-ORDERED-TF-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END]] ; CHECK-ORDERED-TF: for.end: ; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] @@ -1095,28 +1099,28 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement 
( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 1.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = fcmp une [[WIDE_LOAD]], zeroinitializer -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, [[TMP7]], poison) -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] -; CHECK-UNORDERED-NEXT: [[TMP11]] = fadd [[VEC_PHI]], [[PREDPHI]] -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 1.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = fcmp une [[WIDE_LOAD]], zeroinitializer +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, [[TMP9]], poison) +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd [[VEC_PHI]], [[PREDPHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP11]]) +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP13]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-UNORDERED: scalar.ph: @@ -1156,24 +1160,24 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-ORDERED-NEXT: 
[[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = fcmp une [[WIDE_LOAD]], zeroinitializer -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, [[TMP7]], poison) -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] -; CHECK-ORDERED-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[PREDPHI]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = fcmp une [[WIDE_LOAD]], zeroinitializer +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, [[TMP9]], poison) +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] +; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[PREDPHI]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -1181,7 +1185,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: 
scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1201,7 +1205,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: ret float [[RDX]] ; ; CHECK-ORDERED-TF-LABEL: define float @fadd_conditional @@ -1217,63 +1221,63 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = fcmp une [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = 
getelementptr float, ptr [[A]], i64 [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP14]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, [[TMP15]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP18]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD1]] -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP12]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = fcmp une [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP12]] +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = getelementptr float, ptr [[TMP17]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP18]], i32 4, [[TMP16]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = xor [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP19]], zeroinitializer +; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP20]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD1]] +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP22]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP21]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP11]]) +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), 
poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = extractelement [[TMP23]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP26]], 0.000000e+00 +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP25]], 0.000000e+00 ; CHECK-ORDERED-TF-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK-ORDERED-TF: if.then: ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 ; CHECK-ORDERED-TF-NEXT: br label [[FOR_INC]] ; CHECK-ORDERED-TF: for.inc: -; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP27]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP26]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[RDX]] ; @@ -1342,26 +1346,26 @@ define float @fadd_multiple(ptr noalias nocapture %a, ptr noalias nocapture %b, ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ 
insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float -0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = fadd [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP10]] = fadd [[TMP7]], [[WIDE_LOAD1]] -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float -0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = fadd [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12]] = fadd [[TMP9]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[TMP10]]) +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[TMP12]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-UNORDERED: scalar.ph: @@ -1482,78 +1486,78 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 32 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, 
zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] -; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 -; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] -; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 -; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP27]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP30]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP33]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] -; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr 
[[B]], i64 [[TMP19]] -; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 -; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 -; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] -; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 -; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] -; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 -; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP38]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP41]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP44]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP47]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP48]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD7]], [[VEC_PHI]]) -; CHECK-UNORDERED-NEXT: [[TMP49]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) -; CHECK-UNORDERED-NEXT: [[TMP50]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD5]], [[WIDE_LOAD9]], [[VEC_PHI2]]) -; CHECK-UNORDERED-NEXT: [[TMP51]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD6]], [[WIDE_LOAD10]], [[VEC_PHI3]]) -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP53]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0 +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1 +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 24 +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0 +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1 +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr 
inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]] +; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]] +; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 24 +; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP26]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP29]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP32]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP35]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]] +; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]] +; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP21]] +; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 8 +; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP42]] +; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 16 +; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP45]] +; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP48:%.*]] = mul i64 [[TMP47]], 24 +; CHECK-UNORDERED-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP48]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP40]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP43]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP46]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP49]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP50]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD7]], [[VEC_PHI]]) +; CHECK-UNORDERED-NEXT: [[TMP51]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) +; CHECK-UNORDERED-NEXT: [[TMP52]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD5]], [[WIDE_LOAD9]], [[VEC_PHI2]]) +; CHECK-UNORDERED-NEXT: [[TMP53]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD6]], [[WIDE_LOAD10]], [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-UNORDERED-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP54]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd [[TMP49]], [[TMP48]] -; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd [[TMP50]], [[BIN_RDX]] -; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd [[TMP51]], [[BIN_RDX11]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd [[TMP51]], [[TMP50]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd [[TMP52]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd [[TMP53]], [[BIN_RDX11]] ; CHECK-UNORDERED-NEXT: [[TMP55:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[BIN_RDX12]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1588,73 +1592,73 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 32 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP55:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] -; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 -; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] -; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 
[[TMP29]] -; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 -; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP30]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP33]], align 4 -; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] -; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] -; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 -; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 -; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] -; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 -; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] -; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 -; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP38]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP41]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP44]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP47]], align 4 -; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD4]] -; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD5]] -; CHECK-ORDERED-NEXT: [[TMP50:%.*]] = fmul [[WIDE_LOAD2]], [[WIDE_LOAD6]] -; CHECK-ORDERED-NEXT: [[TMP51:%.*]] = fmul [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; CHECK-ORDERED-NEXT: [[TMP52:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP48]]) -; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP52]], [[TMP49]]) -; CHECK-ORDERED-NEXT: [[TMP54:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP53]], [[TMP50]]) -; CHECK-ORDERED-NEXT: [[TMP55]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], [[TMP51]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP57]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0 +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] 
= add i64 [[INDEX]], [[TMP15]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 24 +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0 +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1 +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]] +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]] +; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 24 +; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP26]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP29]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP32]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP35]], align 4 +; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]] +; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]] +; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP21]] +; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 8 +; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP42]] +; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 16 +; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP45]] +; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = mul i64 [[TMP47]], 24 +; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP48]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP40]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP43]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP46]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP49]], align 4 +; CHECK-ORDERED-NEXT: [[TMP50:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD4]] +; CHECK-ORDERED-NEXT: [[TMP51:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD5]] +; CHECK-ORDERED-NEXT: [[TMP52:%.*]] = fmul [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = fmul [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-ORDERED-NEXT: 
[[TMP54:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP50]]) +; CHECK-ORDERED-NEXT: [[TMP55:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], [[TMP51]]) +; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP55]], [[TMP52]]) +; CHECK-ORDERED-NEXT: [[TMP57]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP56]], [[TMP53]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-ORDERED-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP58]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -1662,7 +1666,7 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP57]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1676,7 +1680,7 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP57]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] ; ; CHECK-ORDERED-TF-LABEL: define float @fmuladd_strict @@ -1692,137 +1696,141 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 32 ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call 
@llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP13]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 16 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP15]] +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 24 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 8) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_ENTRY3]], i64 8) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_ENTRY3]], i64 0) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP86:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; 
CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]] -; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT22:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY6]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP73:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = add i64 [[TMP25]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], [[TMP32]] +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP18]] +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP28]] +; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP33]] +; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] ; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] -; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]] -; 
CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP67]] -; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP70]] -; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP73]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP74]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = fmul [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = fmul [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = fmul 
[[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP75]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP79]]) -; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = select [[ACTIVE_LANE_MASK6]], [[TMP76]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP80]], [[TMP81]]) -; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP77]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP82]], [[TMP83]]) -; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP78]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP86]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], [[TMP85]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP88]] -; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = add i64 [[INDEX]], [[TMP90]] -; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = add i64 [[INDEX]], [[TMP93]] -; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = mul i64 [[TMP95]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP91]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP101:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP102:%.*]] = extractelement [[TMP98]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP102]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 +; 
CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP38]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP41]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP44]], i32 4, [[ACTIVE_LANE_MASK9]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP47]], i32 4, [[ACTIVE_LANE_MASK10]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP18]] +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP23]] +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP28]] +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP33]] +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i64 [[TMP54]] +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i64 [[TMP57]] +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = mul i64 [[TMP59]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i64 [[TMP60]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP52]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP55]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP58]], i32 4, [[ACTIVE_LANE_MASK9]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP61]], i32 4, [[ACTIVE_LANE_MASK10]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD14]] +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = fmul [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = fmul [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD16]] +; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = fmul [[WIDE_MASKED_LOAD13]], [[WIDE_MASKED_LOAD17]] +; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP62]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP66]]) +; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP63]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP67]], [[TMP68]]) +; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = select [[ACTIVE_LANE_MASK9]], [[TMP64]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; 
CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP69]], [[TMP70]]) +; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = select [[ACTIVE_LANE_MASK10]], [[TMP65]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP73]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP71]], [[TMP72]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], [[TMP75]] +; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = add i64 [[INDEX]], [[TMP78]] +; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = mul i64 [[TMP80]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = add i64 [[INDEX]], [[TMP81]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP79]], i64 [[TMP11]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT19]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_NEXT]], i64 8) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT20]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT21]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_NEXT18]], i64 8) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT22]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_NEXT18]], i64 0) +; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP86:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = extractelement [[TMP83]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP87]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP73]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP103:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP104:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP103]], float [[TMP104]], float [[SUM_07]]) +; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP88]], float [[TMP89]], float [[SUM_07]]) ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP73]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]] ; @@ -1881,78 +1889,78 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 32 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; 
CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] -; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 -; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] -; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 -; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP27]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP30]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP33]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] -; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] -; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 -; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 -; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] -; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 -; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] -; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 -; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP38]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP41]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP44]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP47]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP48]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD7]], [[VEC_PHI]]) -; CHECK-UNORDERED-NEXT: [[TMP49]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], 
[[VEC_PHI1]]) -; CHECK-UNORDERED-NEXT: [[TMP50]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD5]], [[WIDE_LOAD9]], [[VEC_PHI2]]) -; CHECK-UNORDERED-NEXT: [[TMP51]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD6]], [[WIDE_LOAD10]], [[VEC_PHI3]]) -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP53]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0 +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1 +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 24 +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0 +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1 +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]] +; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]] +; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 24 +; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP26]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP29]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = 
load , ptr [[TMP32]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP35]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]] +; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]] +; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP21]] +; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 8 +; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP42]] +; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 16 +; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP45]] +; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP48:%.*]] = mul i64 [[TMP47]], 24 +; CHECK-UNORDERED-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP48]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP40]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP43]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP46]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP49]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP50]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD7]], [[VEC_PHI]]) +; CHECK-UNORDERED-NEXT: [[TMP51]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) +; CHECK-UNORDERED-NEXT: [[TMP52]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD5]], [[WIDE_LOAD9]], [[VEC_PHI2]]) +; CHECK-UNORDERED-NEXT: [[TMP53]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD6]], [[WIDE_LOAD10]], [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-UNORDERED-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd nnan [[TMP49]], [[TMP48]] -; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd nnan [[TMP50]], [[BIN_RDX]] -; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd nnan [[TMP51]], [[BIN_RDX11]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd nnan [[TMP51]], [[TMP50]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd nnan [[TMP52]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd nnan [[TMP53]], [[BIN_RDX11]] ; CHECK-UNORDERED-NEXT: [[TMP55:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[BIN_RDX12]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1987,73 +1995,73 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 32 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP55:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] -; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 -; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] -; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 -; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP30]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP33]], align 4 -; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] -; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] -; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 -; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 -; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] -; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = call 
i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 -; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] -; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 -; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP38]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP41]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP44]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP47]], align 4 -; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = fmul nnan [[WIDE_LOAD]], [[WIDE_LOAD4]] -; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = fmul nnan [[WIDE_LOAD1]], [[WIDE_LOAD5]] -; CHECK-ORDERED-NEXT: [[TMP50:%.*]] = fmul nnan [[WIDE_LOAD2]], [[WIDE_LOAD6]] -; CHECK-ORDERED-NEXT: [[TMP51:%.*]] = fmul nnan [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; CHECK-ORDERED-NEXT: [[TMP52:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP48]]) -; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP52]], [[TMP49]]) -; CHECK-ORDERED-NEXT: [[TMP54:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP53]], [[TMP50]]) -; CHECK-ORDERED-NEXT: [[TMP55]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], [[TMP51]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP57]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0 +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 24 +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0 +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1 +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]] +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = getelementptr 
inbounds float, ptr [[TMP22]], i64 [[TMP31]] +; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 24 +; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP26]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP29]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP32]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP35]], align 4 +; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]] +; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]] +; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP21]] +; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 8 +; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP42]] +; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 16 +; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP45]] +; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = mul i64 [[TMP47]], 24 +; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP48]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP40]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP43]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP46]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP49]], align 4 +; CHECK-ORDERED-NEXT: [[TMP50:%.*]] = fmul nnan [[WIDE_LOAD]], [[WIDE_LOAD4]] +; CHECK-ORDERED-NEXT: [[TMP51:%.*]] = fmul nnan [[WIDE_LOAD1]], [[WIDE_LOAD5]] +; CHECK-ORDERED-NEXT: [[TMP52:%.*]] = fmul nnan [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = fmul nnan [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-ORDERED-NEXT: [[TMP54:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP50]]) +; CHECK-ORDERED-NEXT: [[TMP55:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], [[TMP51]]) +; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP55]], [[TMP52]]) +; CHECK-ORDERED-NEXT: [[TMP57]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP56]], [[TMP53]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-ORDERED-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP58]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -2061,7 +2069,7 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi 
float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP57]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -2075,7 +2083,7 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP57]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] ; ; CHECK-ORDERED-TF-LABEL: define float @fmuladd_strict_fmf @@ -2091,137 +2099,141 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 32 ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 32 +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP13]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 16 +; 
CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP15]] +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 24 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 8) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_ENTRY3]], i64 8) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_ENTRY3]], i64 0) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP86:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]] -; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT22:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY6]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP73:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: 
[[TMP18:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = add i64 [[TMP25]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], [[TMP32]] +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP18]] +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP28]] +; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP33]] +; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] ; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] -; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; 
CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP67]] -; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP70]] -; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP73]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP74]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = fmul nnan [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = fmul nnan [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = fmul nnan [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = fmul nnan [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = select nnan [[ACTIVE_LANE_MASK]], [[TMP75]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP79]]) -; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = select nnan [[ACTIVE_LANE_MASK6]], [[TMP76]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP80]], [[TMP81]]) -; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = select nnan [[ACTIVE_LANE_MASK7]], [[TMP77]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP82]], [[TMP83]]) -; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select nnan [[ACTIVE_LANE_MASK8]], [[TMP78]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP86]] = call 
nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], [[TMP85]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP88]] -; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = add i64 [[INDEX]], [[TMP90]] -; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = add i64 [[INDEX]], [[TMP93]] -; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = mul i64 [[TMP95]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP91]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP101:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP102:%.*]] = extractelement [[TMP98]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP102]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP38]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP41]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP44]], i32 4, [[ACTIVE_LANE_MASK9]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP47]], i32 4, [[ACTIVE_LANE_MASK10]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP18]] +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP23]] +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP28]] +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP33]] +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 +; 
CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i64 [[TMP54]] +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i64 [[TMP57]] +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = mul i64 [[TMP59]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i64 [[TMP60]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP52]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP55]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP58]], i32 4, [[ACTIVE_LANE_MASK9]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP61]], i32 4, [[ACTIVE_LANE_MASK10]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = fmul nnan [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD14]] +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = fmul nnan [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = fmul nnan [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD16]] +; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = fmul nnan [[WIDE_MASKED_LOAD13]], [[WIDE_MASKED_LOAD17]] +; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = select nnan [[ACTIVE_LANE_MASK]], [[TMP62]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP66]]) +; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = select nnan [[ACTIVE_LANE_MASK8]], [[TMP63]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP67]], [[TMP68]]) +; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = select nnan [[ACTIVE_LANE_MASK9]], [[TMP64]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP69]], [[TMP70]]) +; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = select nnan [[ACTIVE_LANE_MASK10]], [[TMP65]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP73]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP71]], [[TMP72]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], [[TMP75]] +; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = add i64 [[INDEX]], [[TMP78]] +; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = mul i64 [[TMP80]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = add i64 [[INDEX]], [[TMP81]] +; 
CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP79]], i64 [[TMP11]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT19]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_NEXT]], i64 8) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT20]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT21]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_NEXT18]], i64 8) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT22]] = call @llvm.vector.extract.nxv8i1.nxv16i1( [[ACTIVE_LANE_MASK_NEXT18]], i64 0) +; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP86:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = extractelement [[TMP83]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP87]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP73]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP103:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP104:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP103]], float [[TMP104]], float [[SUM_07]]) +; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP88]], float [[TMP89]], float [[SUM_07]]) ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] 
= phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP73]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll index 2acc1ddbffea4..0c1b587c95259 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -18,87 +18,91 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP11]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP11]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 -; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] ; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 -; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] +; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP28]] ; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 12 -; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP32]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 4) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY3]], 
i64 4) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY3]], i64 0) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX6]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 -; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[INDEX6]], [[TMP35]] -; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 8 -; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[INDEX6]], [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 12 -; CHECK-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX6]], [[TMP45]] -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP31]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP41]] -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP46]] -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0 -; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 4 -; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 8 -; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 12 -; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK7]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK8]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK9]]) -; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]] +; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY6]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK11:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[INDEX8]], 0 +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4 +; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[TMP35]], 0 +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 1 +; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[INDEX8]], [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 +; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], 0 +; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 1 +; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[INDEX8]], [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 12 +; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 1 +; CHECK-NEXT: [[TMP48:%.*]] = add i64 [[INDEX8]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP38]] +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP43]] +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP48]] +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr i32, ptr [[TMP49]], i32 0 +; CHECK-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP54]], 4 +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 8 +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP58]] +; CHECK-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 12 +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP61]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP53]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP56]], i32 4, [[ACTIVE_LANE_MASK9]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP59]], i32 4, [[ACTIVE_LANE_MASK10]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP62]], i32 4, [[ACTIVE_LANE_MASK11]]) +; CHECK-NEXT: [[INDEX_NEXT12]] = add i64 [[INDEX8]], [[TMP6]] ; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[TMP63]], 4 -; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX6]], [[TMP64]] +; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX8]], [[TMP64]] ; CHECK-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8 -; CHECK-NEXT: [[TMP68:%.*]] = add i64 [[INDEX6]], [[TMP67]] +; CHECK-NEXT: [[TMP68:%.*]] = add i64 [[INDEX8]], [[TMP67]] ; CHECK-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: 
[[TMP70:%.*]] = mul i64 [[TMP69]], 12 -; CHECK-NEXT: [[TMP71:%.*]] = add i64 [[INDEX6]], [[TMP70]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT11]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP65]], i64 [[TMP9]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP68]], i64 [[TMP9]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP71]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP72:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP73:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP74:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP75:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP71:%.*]] = add i64 [[INDEX8]], [[TMP70]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP68]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX8]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 4) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT13]], i64 4) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT13]], i64 0) +; CHECK-NEXT: [[TMP72:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP73:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP74:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP75:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP76:%.*]] = extractelement [[TMP72]], i32 0 ; CHECK-NEXT: br i1 [[TMP76]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -145,113 +149,117 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP83:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP11]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp 
ugt i64 [[UMAX]], [[TMP11]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 -; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] ; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 -; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] +; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP28]] ; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 12 -; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP32]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 4) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY3]], i64 4) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY3]], i64 0) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX6]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 -; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[INDEX6]], 
[[TMP35]] -; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 8 -; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[INDEX6]], [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 12 -; CHECK-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX6]], [[TMP45]] -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[COND_PTR:%.*]], i64 [[TMP31]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP41]] -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP46]] -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0 -; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 4 -; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 8 -; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 12 -; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK9]], poison) -; CHECK-NEXT: [[TMP61:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP62:%.*]] = icmp ne [[WIDE_MASKED_LOAD10]], zeroinitializer -; CHECK-NEXT: [[TMP63:%.*]] = icmp ne [[WIDE_MASKED_LOAD11]], zeroinitializer +; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY7]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY6]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK11:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[INDEX8]], 0 +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4 +; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[TMP35]], 0 +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 1 +; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[INDEX8]], [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 +; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], 0 +; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 1 +; CHECK-NEXT: 
[[TMP43:%.*]] = add i64 [[INDEX8]], [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 12 +; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 1 +; CHECK-NEXT: [[TMP48:%.*]] = add i64 [[INDEX8]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[COND_PTR:%.*]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP38]] +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP43]] +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP48]] +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr i32, ptr [[TMP49]], i32 0 +; CHECK-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP54]], 4 +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 8 +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP58]] +; CHECK-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 12 +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr i32, ptr [[TMP49]], i64 [[TMP61]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP53]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP56]], i32 4, [[ACTIVE_LANE_MASK9]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP59]], i32 4, [[ACTIVE_LANE_MASK10]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP62]], i32 4, [[ACTIVE_LANE_MASK11]], poison) +; CHECK-NEXT: [[TMP63:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP64:%.*]] = icmp ne [[WIDE_MASKED_LOAD12]], zeroinitializer -; CHECK-NEXT: [[TMP69:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP61]], zeroinitializer -; CHECK-NEXT: [[TMP70:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP62]], zeroinitializer -; CHECK-NEXT: [[TMP71:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP63]], zeroinitializer -; CHECK-NEXT: [[TMP72:%.*]] = select [[ACTIVE_LANE_MASK9]], [[TMP64]], zeroinitializer -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP31]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP41]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP46]] -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0 -; CHECK-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 4 -; CHECK-NEXT: [[TMP76:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP75]] -; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 8 -; CHECK-NEXT: [[TMP79:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP78]] -; CHECK-NEXT: [[TMP80:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP81:%.*]] = mul i64 [[TMP80]], 12 -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP81]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP73]], i32 4, [[TMP69]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, [[TMP70]]) -; CHECK-NEXT: call void 
@llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP79]], i32 4, [[TMP71]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, [[TMP72]]) -; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP84]] +; CHECK-NEXT: [[TMP65:%.*]] = icmp ne [[WIDE_MASKED_LOAD13]], zeroinitializer +; CHECK-NEXT: [[TMP66:%.*]] = icmp ne [[WIDE_MASKED_LOAD14]], zeroinitializer +; CHECK-NEXT: [[TMP67:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP63]], zeroinitializer +; CHECK-NEXT: [[TMP68:%.*]] = select [[ACTIVE_LANE_MASK9]], [[TMP64]], zeroinitializer +; CHECK-NEXT: [[TMP69:%.*]] = select [[ACTIVE_LANE_MASK10]], [[TMP65]], zeroinitializer +; CHECK-NEXT: [[TMP70:%.*]] = select [[ACTIVE_LANE_MASK11]], [[TMP66]], zeroinitializer +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP38]] +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP43]] +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP48]] +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, ptr [[TMP71]], i32 0 +; CHECK-NEXT: [[TMP76:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP77:%.*]] = mul i64 [[TMP76]], 4 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr i32, ptr [[TMP71]], i64 [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP80:%.*]] = mul i64 [[TMP79]], 8 +; CHECK-NEXT: [[TMP81:%.*]] = getelementptr i32, ptr [[TMP71]], i64 [[TMP80]] +; CHECK-NEXT: [[TMP82:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP83:%.*]] = mul i64 [[TMP82]], 12 +; CHECK-NEXT: [[TMP84:%.*]] = getelementptr i32, ptr [[TMP71]], i64 [[TMP83]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP75]], i32 4, [[TMP67]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP78]], i32 4, [[TMP68]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP81]], i32 4, [[TMP69]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP84]], i32 4, [[TMP70]]) +; CHECK-NEXT: [[INDEX_NEXT15]] = add i64 [[INDEX8]], [[TMP6]] ; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP86:%.*]] = mul i64 [[TMP85]], 4 -; CHECK-NEXT: [[TMP87:%.*]] = add i64 [[INDEX6]], [[TMP86]] +; CHECK-NEXT: [[TMP87:%.*]] = add i64 [[INDEX8]], [[TMP86]] ; CHECK-NEXT: [[TMP88:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP89:%.*]] = mul i64 [[TMP88]], 8 -; CHECK-NEXT: [[TMP90:%.*]] = add i64 [[INDEX6]], [[TMP89]] +; CHECK-NEXT: [[TMP90:%.*]] = add i64 [[INDEX8]], [[TMP89]] ; CHECK-NEXT: [[TMP91:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP92:%.*]] = mul i64 [[TMP91]], 12 -; CHECK-NEXT: [[TMP93:%.*]] = add i64 [[INDEX6]], [[TMP92]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP87]], i64 [[TMP9]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP90]], i64 [[TMP9]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP94:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP95:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( 
insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP96:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP97:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP93:%.*]] = add i64 [[INDEX8]], [[TMP92]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP90]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX8]], i64 [[TMP9]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 4) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT19]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT16]], i64 4) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT20]] = call @llvm.vector.extract.nxv4i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT16]], i64 0) +; CHECK-NEXT: [[TMP94:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP95:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP96:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP97:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP98:%.*]] = extractelement [[TMP94]], i32 0 ; CHECK-NEXT: br i1 [[TMP98]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll new file mode 100644 index 0000000000000..f766cfeefb0d8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll @@ -0,0 +1,656 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes="default" -sve-tail-folding-insn-threshold=0 -force-vector-interleave=0 < %s | FileCheck %s -check-prefix CHECK-SVE-UF0 +; RUN: opt -S --passes="default" -sve-tail-folding-insn-threshold=0 -force-vector-interleave=2 < %s | FileCheck %s -check-prefix CHECK-SVE-UF2 +; RUN: opt -S --passes="default" -sve-tail-folding-insn-threshold=0 -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-SVE-UF4 +; RUN: opt -S --passes="default" -sve-tail-folding-insn-threshold=0 -force-vector-interleave=0 -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1-UF0 +; RUN: opt -S --passes="default" -sve-tail-folding-insn-threshold=0 -force-vector-interleave=2 -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1-UF2 +; RUN: opt -S --passes="default" -sve-tail-folding-insn-threshold=0 -force-vector-interleave=4 -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1-UF4 + +target triple = "aarch64-unknown-linux" + +define void @f0(ptr noalias %dst, ptr readonly %src, i32 %n) #0 { +; CHECK-SVE-UF0-LABEL: define void @f0( +; CHECK-SVE-UF0-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { 
+; CHECK-SVE-UF0-NEXT: entry: +; CHECK-SVE-UF0-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF0-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF0: for.body.preheader: +; CHECK-SVE-UF0-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF0-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF0: vector.body: +; CHECK-SVE-UF0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF0-NEXT: [[TMP3:%.*]] = mul [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF0-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP3]], ptr [[TMP4]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF0-NEXT: [[TMP5:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-SVE-UF0-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-UF0: for.cond.cleanup: +; CHECK-SVE-UF0-NEXT: ret void +; +; CHECK-SVE-UF2-LABEL: define void @f0( +; CHECK-SVE-UF2-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SVE-UF2-NEXT: entry: +; CHECK-SVE-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF2-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF2: for.body.preheader: +; CHECK-SVE-UF2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF2-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 5 +; CHECK-SVE-UF2-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4 +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP3]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF2-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; CHECK-SVE-UF2-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 4 +; CHECK-SVE-UF2-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 4 +; CHECK-SVE-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; 
CHECK-SVE-UF2: vector.body: +; CHECK-SVE-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP5]] +; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP11]], i32 1, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-SVE-UF2-NEXT: [[TMP12:%.*]] = mul [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF2-NEXT: [[TMP13:%.*]] = mul [[WIDE_MASKED_LOAD3]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 [[TMP7]] +; CHECK-SVE-UF2-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP12]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF2-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP13]], ptr [[TMP15]], i32 1, [[ACTIVE_LANE_MASK2]]) +; CHECK-SVE-UF2-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF2-NEXT: [[TMP16:%.*]] = add i64 [[INDEX_NEXT4]], [[TMP9]] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP16]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT4]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF2-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT5]], i64 0 +; CHECK-SVE-UF2-NEXT: br i1 [[TMP17]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-UF2: for.cond.cleanup: +; CHECK-SVE-UF2-NEXT: ret void +; +; CHECK-SVE-UF4-LABEL: define void @f0( +; CHECK-SVE-UF4-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SVE-UF4-NEXT: entry: +; CHECK-SVE-UF4-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF4-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF4: for.body.preheader: +; CHECK-SVE-UF4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF4-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 6 +; CHECK-SVE-UF4-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4 +; CHECK-SVE-UF4-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 5 +; CHECK-SVE-UF4-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP7:%.*]] = mul nuw nsw i64 [[TMP6]], 48 +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 
[[TMP7]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP5]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP3]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 4 +; CHECK-SVE-UF4-NEXT: [[TMP10:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 5 +; CHECK-SVE-UF4-NEXT: [[TMP12:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP13:%.*]] = mul nuw nsw i64 [[TMP12]], 48 +; CHECK-SVE-UF4-NEXT: [[TMP14:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 4 +; CHECK-SVE-UF4-NEXT: [[TMP16:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 5 +; CHECK-SVE-UF4-NEXT: [[TMP18:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP19:%.*]] = mul nuw nsw i64 [[TMP18]], 48 +; CHECK-SVE-UF4-NEXT: [[TMP20:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP21:%.*]] = shl nuw nsw i64 [[TMP20]], 4 +; CHECK-SVE-UF4-NEXT: [[TMP22:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP23:%.*]] = shl nuw nsw i64 [[TMP22]], 5 +; CHECK-SVE-UF4-NEXT: [[TMP24:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP25:%.*]] = mul nuw nsw i64 [[TMP24]], 48 +; CHECK-SVE-UF4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF4: vector.body: +; CHECK-SVE-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT14:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[TMP9]] +; CHECK-SVE-UF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[TMP11]] +; CHECK-SVE-UF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[TMP13]] +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP26]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP27]], i32 1, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP28]], i32 1, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP29]], i32 1, [[ACTIVE_LANE_MASK8]], poison) 
+; CHECK-SVE-UF4-NEXT: [[TMP30:%.*]] = mul [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF4-NEXT: [[TMP31:%.*]] = mul [[WIDE_MASKED_LOAD9]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF4-NEXT: [[TMP32:%.*]] = mul [[WIDE_MASKED_LOAD10]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF4-NEXT: [[TMP33:%.*]] = mul [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i64 [[TMP15]] +; CHECK-SVE-UF4-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i64 [[TMP17]] +; CHECK-SVE-UF4-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i64 [[TMP19]] +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP30]], ptr [[TMP34]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP31]], ptr [[TMP35]], i32 1, [[ACTIVE_LANE_MASK6]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP32]], ptr [[TMP36]], i32 1, [[ACTIVE_LANE_MASK7]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP33]], ptr [[TMP37]], i32 1, [[ACTIVE_LANE_MASK8]]) +; CHECK-SVE-UF4-NEXT: [[INDEX_NEXT14]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF4-NEXT: [[TMP38:%.*]] = add i64 [[INDEX_NEXT14]], [[TMP21]] +; CHECK-SVE-UF4-NEXT: [[TMP39:%.*]] = add i64 [[INDEX_NEXT14]], [[TMP23]] +; CHECK-SVE-UF4-NEXT: [[TMP40:%.*]] = add i64 [[INDEX_NEXT14]], [[TMP25]] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP40]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP39]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP38]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT14]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[TMP41:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT17]], i64 0 +; CHECK-SVE-UF4-NEXT: br i1 [[TMP41]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-UF4: for.cond.cleanup: +; CHECK-SVE-UF4-NEXT: ret void +; +; CHECK-SVE2p1-UF0-LABEL: define void @f0( +; CHECK-SVE2p1-UF0-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SVE2p1-UF0-NEXT: entry: +; CHECK-SVE2p1-UF0-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE2p1-UF0-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE2p1-UF0: for.body.preheader: +; CHECK-SVE2p1-UF0-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE2p1-UF0-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF0-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE2p1-UF0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE2p1-UF0: vector.body: +; CHECK-SVE2p1-UF0-NEXT: [[INDEX:%.*]] = phi 
i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE2p1-UF0-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE2p1-UF0-NEXT: [[TMP3:%.*]] = mul [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF0-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE2p1-UF0-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP3]], ptr [[TMP4]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE2p1-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE2p1-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF0-NEXT: [[TMP5:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-SVE2p1-UF0-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE2p1-UF0: for.cond.cleanup: +; CHECK-SVE2p1-UF0-NEXT: ret void +; +; CHECK-SVE2p1-UF2-LABEL: define void @f0( +; CHECK-SVE2p1-UF2-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SVE2p1-UF2-NEXT: entry: +; CHECK-SVE2p1-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE2p1-UF2-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE2p1-UF2: for.body.preheader: +; CHECK-SVE2p1-UF2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE2p1-UF2-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 5 +; CHECK-SVE2p1-UF2-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF2-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4 +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP3]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF2-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF2-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; CHECK-SVE2p1-UF2-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF2-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 4 +; CHECK-SVE2p1-UF2-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF2-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 4 +; CHECK-SVE2p1-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE2p1-UF2: vector.body: +; CHECK-SVE2p1-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE2p1-UF2-NEXT: 
[[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP5]] +; CHECK-SVE2p1-UF2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE2p1-UF2-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP11]], i32 1, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-SVE2p1-UF2-NEXT: [[TMP12:%.*]] = mul [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF2-NEXT: [[TMP13:%.*]] = mul [[WIDE_MASKED_LOAD3]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE2p1-UF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 [[TMP7]] +; CHECK-SVE2p1-UF2-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP12]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE2p1-UF2-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP13]], ptr [[TMP15]], i32 1, [[ACTIVE_LANE_MASK2]]) +; CHECK-SVE2p1-UF2-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE2p1-UF2-NEXT: [[TMP16:%.*]] = add i64 [[INDEX_NEXT4]], [[TMP9]] +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP16]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT4]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF2-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT5]], i64 0 +; CHECK-SVE2p1-UF2-NEXT: br i1 [[TMP17]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE2p1-UF2: for.cond.cleanup: +; CHECK-SVE2p1-UF2-NEXT: ret void +; +; CHECK-SVE2p1-UF4-LABEL: define void @f0( +; CHECK-SVE2p1-UF4-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SVE2p1-UF4-NEXT: entry: +; CHECK-SVE2p1-UF4-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE2p1-UF4-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE2p1-UF4: for.body.preheader: +; CHECK-SVE2p1-UF4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE2p1-UF4-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 6 +; CHECK-SVE2p1-UF4-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4 +; CHECK-SVE2p1-UF4-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 5 +; CHECK-SVE2p1-UF4-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP7:%.*]] = mul nuw nsw i64 [[TMP6]], 48 +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP7]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP5]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP3]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF4-NEXT: 
[[TMP8:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 4 +; CHECK-SVE2p1-UF4-NEXT: [[TMP10:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 5 +; CHECK-SVE2p1-UF4-NEXT: [[TMP12:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP13:%.*]] = mul nuw nsw i64 [[TMP12]], 48 +; CHECK-SVE2p1-UF4-NEXT: [[TMP14:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 4 +; CHECK-SVE2p1-UF4-NEXT: [[TMP16:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 5 +; CHECK-SVE2p1-UF4-NEXT: [[TMP18:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP19:%.*]] = mul nuw nsw i64 [[TMP18]], 48 +; CHECK-SVE2p1-UF4-NEXT: [[TMP20:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP21:%.*]] = shl nuw nsw i64 [[TMP20]], 4 +; CHECK-SVE2p1-UF4-NEXT: [[TMP22:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP23:%.*]] = shl nuw nsw i64 [[TMP22]], 5 +; CHECK-SVE2p1-UF4-NEXT: [[TMP24:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP25:%.*]] = mul nuw nsw i64 [[TMP24]], 48 +; CHECK-SVE2p1-UF4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE2p1-UF4: vector.body: +; CHECK-SVE2p1-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT14:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[TMP9]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[TMP11]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[TMP13]] +; CHECK-SVE2p1-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP26]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE2p1-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP27]], i32 1, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-SVE2p1-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP28]], i32 1, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-SVE2p1-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP29]], i32 1, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-SVE2p1-UF4-NEXT: [[TMP30:%.*]] = mul [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF4-NEXT: [[TMP31:%.*]] = mul [[WIDE_MASKED_LOAD9]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF4-NEXT: [[TMP32:%.*]] = mul [[WIDE_MASKED_LOAD10]], shufflevector ( 
insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF4-NEXT: [[TMP33:%.*]] = mul [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, i8 3, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i64 [[TMP15]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i64 [[TMP17]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i64 [[TMP19]] +; CHECK-SVE2p1-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP30]], ptr [[TMP34]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE2p1-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP31]], ptr [[TMP35]], i32 1, [[ACTIVE_LANE_MASK6]]) +; CHECK-SVE2p1-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP32]], ptr [[TMP36]], i32 1, [[ACTIVE_LANE_MASK7]]) +; CHECK-SVE2p1-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP33]], ptr [[TMP37]], i32 1, [[ACTIVE_LANE_MASK8]]) +; CHECK-SVE2p1-UF4-NEXT: [[INDEX_NEXT14]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP38:%.*]] = add i64 [[INDEX_NEXT14]], [[TMP21]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP39:%.*]] = add i64 [[INDEX_NEXT14]], [[TMP23]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP40:%.*]] = add i64 [[INDEX_NEXT14]], [[TMP25]] +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP40]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP39]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP38]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT14]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF4-NEXT: [[TMP41:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT17]], i64 0 +; CHECK-SVE2p1-UF4-NEXT: br i1 [[TMP41]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE2p1-UF4: for.cond.cleanup: +; CHECK-SVE2p1-UF4-NEXT: ret void +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv + %0 = load i8, ptr %arrayidx, align 1 + %mul = mul i8 %0, 3 + %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv + store i8 %mul, ptr %arrayidx3, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define void @f1(ptr noalias %dst, ptr readonly %src, i32 %n) #0 { +; CHECK-SVE-UF0-LABEL: define void @f1( +; CHECK-SVE-UF0-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-SVE-UF0-NEXT: entry: +; CHECK-SVE-UF0-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF0-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF0: for.body.preheader: +; CHECK-SVE-UF0-NEXT: 
[[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF0-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF0: vector.body: +; CHECK-SVE-UF0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[TMP2]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF0-NEXT: [[TMP3:%.*]] = fmul [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF0-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP3]], ptr [[TMP4]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF0-NEXT: [[TMP5:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-SVE-UF0-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-SVE-UF0: for.cond.cleanup: +; CHECK-SVE-UF0-NEXT: ret void +; +; CHECK-SVE-UF2-LABEL: define void @f1( +; CHECK-SVE-UF2-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-SVE-UF2-NEXT: entry: +; CHECK-SVE-UF2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF2-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF2: for.body.preheader: +; CHECK-SVE-UF2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF2-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE-UF2-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1 +; CHECK-SVE-UF2-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1 +; CHECK-SVE-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF2: vector.body: +; CHECK-SVE-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT5:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY2]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ 
[[ACTIVE_LANE_MASK_ENTRY1]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 [[TMP3]] +; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP7]], i32 8, [[ACTIVE_LANE_MASK3]], poison) +; CHECK-SVE-UF2-NEXT: [[TMP8:%.*]] = fmul [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF2-NEXT: [[TMP9:%.*]] = fmul [[WIDE_MASKED_LOAD4]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF2-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 [[TMP5]] +; CHECK-SVE-UF2-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP8]], ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF2-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP9]], ptr [[TMP11]], i32 8, [[ACTIVE_LANE_MASK3]]) +; CHECK-SVE-UF2-NEXT: [[INDEX_NEXT5]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT5]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT7]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE-UF2-NEXT: [[TMP12:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT7]], i64 0 +; CHECK-SVE-UF2-NEXT: br i1 [[TMP12]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-SVE-UF2: for.cond.cleanup: +; CHECK-SVE-UF2-NEXT: ret void +; +; CHECK-SVE-UF4-LABEL: define void @f1( +; CHECK-SVE-UF4-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-SVE-UF4-NEXT: entry: +; CHECK-SVE-UF4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF4-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF4: for.body.preheader: +; CHECK-SVE-UF4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF4-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-SVE-UF4-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP3]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( 
[[ACTIVE_LANE_MASK_ENTRY3]], i64 2) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY3]], i64 0) +; CHECK-SVE-UF4-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1 +; CHECK-SVE-UF4-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-SVE-UF4-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP9:%.*]] = mul nuw nsw i64 [[TMP8]], 6 +; CHECK-SVE-UF4-NEXT: [[TMP10:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 1 +; CHECK-SVE-UF4-NEXT: [[TMP12:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 +; CHECK-SVE-UF4-NEXT: [[TMP14:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP15:%.*]] = mul nuw nsw i64 [[TMP14]], 6 +; CHECK-SVE-UF4-NEXT: [[TMP16:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-SVE-UF4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF4: vector.body: +; CHECK-SVE-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY7]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY6]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP5]] +; CHECK-SVE-UF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP7]] +; CHECK-SVE-UF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP9]] +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[TMP18]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP19]], i32 8, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP20]], i32 8, [[ACTIVE_LANE_MASK9]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP21]], i32 8, [[ACTIVE_LANE_MASK10]], poison) +; CHECK-SVE-UF4-NEXT: [[TMP22:%.*]] = fmul [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF4-NEXT: [[TMP23:%.*]] = fmul [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF4-NEXT: [[TMP24:%.*]] = fmul [[WIDE_MASKED_LOAD12]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF4-NEXT: [[TMP25:%.*]] = fmul [[WIDE_MASKED_LOAD13]], 
shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE-UF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 [[TMP11]] +; CHECK-SVE-UF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 [[TMP13]] +; CHECK-SVE-UF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 [[TMP15]] +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP22]], ptr [[TMP26]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP23]], ptr [[TMP27]], i32 8, [[ACTIVE_LANE_MASK8]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP24]], ptr [[TMP28]], i32 8, [[ACTIVE_LANE_MASK9]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP25]], ptr [[TMP29]], i32 8, [[ACTIVE_LANE_MASK10]]) +; CHECK-SVE-UF4-NEXT: [[INDEX_NEXT16]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF4-NEXT: [[TMP30:%.*]] = add i64 [[INDEX_NEXT16]], [[TMP17]] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP30]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT17:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT16]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT19]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT20]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT17]], i64 2) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT21]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT17]], i64 0) +; CHECK-SVE-UF4-NEXT: [[TMP31:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT21]], i64 0 +; CHECK-SVE-UF4-NEXT: br i1 [[TMP31]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-SVE-UF4: for.cond.cleanup: +; CHECK-SVE-UF4-NEXT: ret void +; +; CHECK-SVE2p1-UF0-LABEL: define void @f1( +; CHECK-SVE2p1-UF0-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-SVE2p1-UF0-NEXT: entry: +; CHECK-SVE2p1-UF0-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE2p1-UF0-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE2p1-UF0: for.body.preheader: +; CHECK-SVE2p1-UF0-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE2p1-UF0-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF0-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +; CHECK-SVE2p1-UF0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE2p1-UF0: vector.body: +; CHECK-SVE2p1-UF0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF0-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE2p1-UF0-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[TMP2]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE2p1-UF0-NEXT: [[TMP3:%.*]] = fmul [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF0-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE2p1-UF0-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP3]], ptr [[TMP4]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE2p1-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE2p1-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF0-NEXT: [[TMP5:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-SVE2p1-UF0-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-SVE2p1-UF0: for.cond.cleanup: +; CHECK-SVE2p1-UF0-NEXT: ret void +; +; CHECK-SVE2p1-UF2-LABEL: define void @f1( +; CHECK-SVE2p1-UF2-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-SVE2p1-UF2-NEXT: entry: +; CHECK-SVE2p1-UF2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE2p1-UF2-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE2p1-UF2: for.body.preheader: +; CHECK-SVE2p1-UF2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE2p1-UF2-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE2p1-UF2-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF2-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1 +; CHECK-SVE2p1-UF2-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF2-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1 +; CHECK-SVE2p1-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE2p1-UF2: vector.body: +; CHECK-SVE2p1-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT5:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY2]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE2p1-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 [[TMP3]] +; CHECK-SVE2p1-UF2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE2p1-UF2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP7]], i32 8, [[ACTIVE_LANE_MASK3]], poison) +; CHECK-SVE2p1-UF2-NEXT: [[TMP8:%.*]] = fmul [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), 
poison, zeroinitializer) +; CHECK-SVE2p1-UF2-NEXT: [[TMP9:%.*]] = fmul [[WIDE_MASKED_LOAD4]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE2p1-UF2-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 [[TMP5]] +; CHECK-SVE2p1-UF2-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP8]], ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE2p1-UF2-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP9]], ptr [[TMP11]], i32 8, [[ACTIVE_LANE_MASK3]]) +; CHECK-SVE2p1-UF2-NEXT: [[INDEX_NEXT5]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT5]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; CHECK-SVE2p1-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT7]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE2p1-UF2-NEXT: [[TMP12:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT7]], i64 0 +; CHECK-SVE2p1-UF2-NEXT: br i1 [[TMP12]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-SVE2p1-UF2: for.cond.cleanup: +; CHECK-SVE2p1-UF2-NEXT: ret void +; +; CHECK-SVE2p1-UF4-LABEL: define void @f1( +; CHECK-SVE2p1-UF4-SAME: ptr noalias nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-SVE2p1-UF4-NEXT: entry: +; CHECK-SVE2p1-UF4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE2p1-UF4-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE2p1-UF4: for.body.preheader: +; CHECK-SVE2p1-UF4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE2p1-UF4-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-SVE2p1-UF4-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP3]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY6:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY3]], i64 2) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY7:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY3]], i64 0) +; CHECK-SVE2p1-UF4-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1 +; CHECK-SVE2p1-UF4-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-SVE2p1-UF4-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP9:%.*]] = mul nuw nsw i64 [[TMP8]], 6 +; CHECK-SVE2p1-UF4-NEXT: [[TMP10:%.*]] 
= tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 1 +; CHECK-SVE2p1-UF4-NEXT: [[TMP12:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 +; CHECK-SVE2p1-UF4-NEXT: [[TMP14:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP15:%.*]] = mul nuw nsw i64 [[TMP14]], 6 +; CHECK-SVE2p1-UF4-NEXT: [[TMP16:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE2p1-UF4-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-SVE2p1-UF4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE2p1-UF4: vector.body: +; CHECK-SVE2p1-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY7]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY6]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE2p1-UF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP5]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP7]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP9]] +; CHECK-SVE2p1-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[TMP18]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE2p1-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP19]], i32 8, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-SVE2p1-UF4-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP20]], i32 8, [[ACTIVE_LANE_MASK9]], poison) +; CHECK-SVE2p1-UF4-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP21]], i32 8, [[ACTIVE_LANE_MASK10]], poison) +; CHECK-SVE2p1-UF4-NEXT: [[TMP22:%.*]] = fmul [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF4-NEXT: [[TMP23:%.*]] = fmul [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF4-NEXT: [[TMP24:%.*]] = fmul [[WIDE_MASKED_LOAD12]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF4-NEXT: [[TMP25:%.*]] = fmul [[WIDE_MASKED_LOAD13]], shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-SVE2p1-UF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 [[TMP11]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 [[TMP13]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 [[TMP15]] +; CHECK-SVE2p1-UF4-NEXT: tail call void 
@llvm.masked.store.nxv2f64.p0( [[TMP22]], ptr [[TMP26]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE2p1-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP23]], ptr [[TMP27]], i32 8, [[ACTIVE_LANE_MASK8]]) +; CHECK-SVE2p1-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP24]], ptr [[TMP28]], i32 8, [[ACTIVE_LANE_MASK9]]) +; CHECK-SVE2p1-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP25]], ptr [[TMP29]], i32 8, [[ACTIVE_LANE_MASK10]]) +; CHECK-SVE2p1-UF4-NEXT: [[INDEX_NEXT16]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE2p1-UF4-NEXT: [[TMP30:%.*]] = add i64 [[INDEX_NEXT16]], [[TMP17]] +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP30]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT17:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT16]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT19]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT20]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT17]], i64 2) +; CHECK-SVE2p1-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT21]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT17]], i64 0) +; CHECK-SVE2p1-UF4-NEXT: [[TMP31:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT21]], i64 0 +; CHECK-SVE2p1-UF4-NEXT: br i1 [[TMP31]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-SVE2p1-UF4: for.cond.cleanup: +; CHECK-SVE2p1-UF4-NEXT: ret void +; +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %src, i64 %indvars.iv + %0 = load double, ptr %arrayidx, align 8 + %mul = fmul double %0, 3.000000e+00 + %arrayidx2 = getelementptr inbounds double, ptr %dst, i64 %indvars.iv + store double %mul, ptr %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +attributes #0 = {nounwind vscale_range(1,16) "target-cpu"="neoverse-v1" } +;. +; CHECK-SVE-UF0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE-UF0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE-UF0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE-UF0: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +;. +; CHECK-SVE-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE-UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +;. +; CHECK-SVE-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +;. 
+; CHECK-SVE2p1-UF0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE2p1-UF0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE2p1-UF0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE2p1-UF0: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +;. +; CHECK-SVE2p1-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE2p1-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE2p1-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE2p1-UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +;. +; CHECK-SVE2p1-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE2p1-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE2p1-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE2p1-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll index d8f14f30295b6..8c1747c26bdf9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll @@ -40,36 +40,33 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 ; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) -; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1 -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[N]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 1 -; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i64 [[TMP12]] -; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK2]], poison) -; INTERLEAVE-NEXT: [[TMP14:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) -; INTERLEAVE-NEXT: [[TMP15:%.*]] = call @foo_uniform( 
[[WIDE_MASKED_LOAD3]], i64 [[UNIFORM]], [[ACTIVE_LANE_MASK2]]) -; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP18:%.*]] = shl i64 [[TMP17]], 1 -; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i64 [[TMP18]] -; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP14]], ptr [[TMP16]], i32 8, [[ACTIVE_LANE_MASK]]) -; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP15]], ptr [[TMP19]], i32 8, [[ACTIVE_LANE_MASK2]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY2]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 +; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[TMP5]], i64 [[TMP7]] +; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP8]], i32 8, [[ACTIVE_LANE_MASK3]], poison) +; INTERLEAVE-NEXT: [[TMP9:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: [[TMP10:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD4]], i64 [[UNIFORM]], [[ACTIVE_LANE_MASK3]]) +; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP13:%.*]] = shl i64 [[TMP12]], 1 +; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[TMP11]], i64 [[TMP13]] +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP9]], ptr [[TMP11]], i32 8, [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP10]], ptr [[TMP14]], i32 8, [[ACTIVE_LANE_MASK3]]) ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] -; INTERLEAVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP21:%.*]] = shl i64 [[TMP20]], 1 -; INTERLEAVE-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], [[TMP21]] -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]]) -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP22]], i64 [[TMP4]]) -; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT6]], i64 0 ; INTERLEAVE-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; INTERLEAVE: for.cond.cleanup: ; INTERLEAVE-NEXT: ret void @@ -126,36 +123,33 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3 ; 
INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 ; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) -; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1 -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[N]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 1 -; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i64 [[TMP12]] -; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK2]], poison) -; INTERLEAVE-NEXT: [[TMP14:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) -; INTERLEAVE-NEXT: [[TMP15:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD3]], i32 [[UNIFORM]], [[ACTIVE_LANE_MASK2]]) -; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP18:%.*]] = shl i64 [[TMP17]], 1 -; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i64 [[TMP18]] -; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP14]], ptr [[TMP16]], i32 8, [[ACTIVE_LANE_MASK]]) -; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP15]], ptr [[TMP19]], i32 8, [[ACTIVE_LANE_MASK2]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY2]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 +; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[TMP5]], i64 [[TMP7]] +; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP8]], i32 
8, [[ACTIVE_LANE_MASK3]], poison) +; INTERLEAVE-NEXT: [[TMP9:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: [[TMP10:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD4]], i32 [[UNIFORM]], [[ACTIVE_LANE_MASK3]]) +; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP13:%.*]] = shl i64 [[TMP12]], 1 +; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[TMP11]], i64 [[TMP13]] +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP9]], ptr [[TMP11]], i32 8, [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP10]], ptr [[TMP14]], i32 8, [[ACTIVE_LANE_MASK3]]) ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] -; INTERLEAVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; INTERLEAVE-NEXT: [[TMP21:%.*]] = shl i64 [[TMP20]], 1 -; INTERLEAVE-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], [[TMP21]] -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]]) -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP22]], i64 [[TMP4]]) -; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT6]], i64 0 ; INTERLEAVE-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]] ; INTERLEAVE: for.cond.cleanup: ; INTERLEAVE-NEXT: ret void @@ -201,8 +195,8 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 ; INTERLEAVE-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; INTERLEAVE-NEXT: entry: ; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 2) -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = icmp ne i64 [[N]], 0 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ugt i64 [[N]], 1 +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = icmp ne i64 [[N]], 0 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] @@ -229,8 +223,8 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 ; INTERLEAVE: pred.store.continue4: ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1 -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP0]] ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = icmp ult i64 [[TMP11]], [[TMP0]] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP0]] ; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK_NEXT]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; INTERLEAVE: for.cond.cleanup: ; INTERLEAVE-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll 
b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll index 8f3cfc5b20ee9..c9645646ad0ac 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll @@ -56,10 +56,10 @@ define dso_local void @interleave4(ptr noalias nocapture %A, ptr noalias nocaptu ; PREDFLAG: %[[ADD2:.*]] = add i32 %index, 4 ; PREDFLAG: %[[ADD3:.*]] = add i32 %index, 8 ; PREDFLAG: %[[ADD4:.*]] = add i32 %index, 12 -; PREDFLAG: %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %N) -; PREDFLAG: %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %N) -; PREDFLAG: %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %N) ; PREDFLAG: %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %N) +; PREDFLAG: %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %N) +; PREDFLAG: %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %N) +; PREDFLAG: %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %N) ; ; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %[[ALM1]],{{.*}} ; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %[[ALM2]],{{.*}} diff --git a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll index 97c84f251fef1..8770a19ce9cfa 100644 --- a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll +++ b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll @@ -49,8 +49,8 @@ define float @pr70988() { ; CHECK-ALM-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-ALM-NEXT: [[TMP0:%.*]] = add i32 [[INDEX1]], 0 ; CHECK-ALM-NEXT: [[TMP1:%.*]] = add i32 [[INDEX1]], 1 -; CHECK-ALM-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ult i32 [[TMP0]], 1021 ; CHECK-ALM-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = icmp ult i32 [[TMP1]], 1021 +; CHECK-ALM-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ult i32 [[TMP0]], 1021 ; CHECK-ALM-NEXT: [[TMP2:%.*]] = select contract i1 [[ACTIVE_LANE_MASK]], float 1.000000e+00, float -0.000000e+00 ; CHECK-ALM-NEXT: [[TMP3:%.*]] = fadd contract float [[VEC_PHI]], [[TMP2]] ; CHECK-ALM-NEXT: [[TMP4:%.*]] = select contract i1 [[ACTIVE_LANE_MASK2]], float 1.000000e+00, float -0.000000e+00 @@ -158,8 +158,8 @@ define float @pr72720reduction_using_active_lane_mask(ptr %src) { ; CHECK-ALM-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[PRED_LOAD_CONTINUE3]] ] ; CHECK-ALM-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-ALM-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 -; CHECK-ALM-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ult i32 [[TMP0]], 15 ; CHECK-ALM-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = icmp ult i32 [[TMP1]], 15 +; CHECK-ALM-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ult i32 [[TMP0]], 15 ; CHECK-ALM-NEXT: br i1 [[ACTIVE_LANE_MASK]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK-ALM: pred.load.if: ; CHECK-ALM-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[SRC]], i32 [[TMP0]]