diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 58fcba93f1a18..4a5cde57bade1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5577,6 +5577,34 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
     Cost += BlockCost;
   }
 
+#ifndef NDEBUG
+  // TODO: We're effectively having to duplicate the code from
+  // VPInstruction::computeCost, which is ugly. This isn't meant to be a fully
+  // accurate representation of the cost of tail-folding - it exists purely to
+  // stop asserts firing when the legacy cost doesn't match the VPlan cost.
+  if (!VF.isScalar() && foldTailByMasking()) {
+    TailFoldingStyle Style = getTailFoldingStyle();
+    LLVMContext &Context = TheLoop->getHeader()->getContext();
+    Type *I1Ty = IntegerType::getInt1Ty(Context);
+    Type *IndTy = Legal->getWidestInductionType();
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    if (Style == TailFoldingStyle::DataWithEVL) {
+      Type *I32Ty = IntegerType::getInt32Ty(Context);
+      IntrinsicCostAttributes Attrs(
+          Intrinsic::experimental_get_vector_length, I32Ty,
+          {PoisonValue::get(IndTy), PoisonValue::get(I32Ty),
+           PoisonValue::get(I1Ty)});
+      Cost += TTI.getIntrinsicInstrCost(Attrs, CostKind);
+    } else if (useActiveLaneMask(Style)) {
+      VectorType *RetTy = VectorType::get(I1Ty, VF);
+      IntrinsicCostAttributes Attrs(
+          Intrinsic::get_active_lane_mask, RetTy,
+          {PoisonValue::get(IndTy), PoisonValue::get(IndTy)});
+      Cost += TTI.getIntrinsicInstrCost(Attrs, CostKind);
+    }
+  }
+#endif
+
   return Cost;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4e5878cae2ddc..5ff7bbec00058 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1378,6 +1378,10 @@ class VPInstruction : public VPRecipeWithIRFlags,
 
   /// Returns the symbolic name assigned to the VPInstruction.
   StringRef getName() const { return Name; }
+
+  /// Return the cost of this VPInstruction.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
 };
 
 /// A recipe to wrap on original IR instruction not to be modified during
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 8b8ab6be99b0d..bc5854515fdb8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -58,6 +58,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
     CachedTypes[OtherV] = ResTy;
     return ResTy;
   }
+  case VPInstruction::CalculateTripCountMinusVF: {
+    return inferScalarType(R->getOperand(0));
+  }
   case Instruction::ICmp:
   case VPInstruction::ActiveLaneMask:
     return inferScalarType(R->getOperand(1));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2ecd546633825..8425b298bf531 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -356,6 +356,33 @@ VPInstruction::VPInstruction(unsigned Opcode,
   assert(isFPMathOp() && "this op can't take fast-math flags");
 }
 
+InstructionCost VPInstruction::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) const {
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+  switch (getOpcode()) {
+  case VPInstruction::ActiveLaneMask: {
+    Type *ArgTy = Ctx.Types.inferScalarType(getOperand(1));
+    Type *RetTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+    IntrinsicCostAttributes Attrs(
+        Intrinsic::get_active_lane_mask, RetTy,
+        {PoisonValue::get(ArgTy), PoisonValue::get(ArgTy)});
+    return Ctx.TTI.getIntrinsicInstrCost(Attrs, CostKind);
+  }
+  case VPInstruction::ExplicitVectorLength: {
+    Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
+    Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
+    IntrinsicCostAttributes Attrs(
+        Intrinsic::experimental_get_vector_length, I32Ty,
+        {PoisonValue::get(I32Ty), PoisonValue::get(I1Ty)});
+    return Ctx.TTI.getIntrinsicInstrCost(Attrs, CostKind);
+  }
+  default:
+    // TODO: Fill out other opcodes!
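+    // Until then, unhandled opcodes are treated as free; the two cases above
+    // mirror the logic duplicated (for assertion purposes) in
+    // LoopVectorizationCostModel::expectedCost.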
+    return 0;
+  }
+}
+
 bool VPInstruction::doesGeneratePerAllLanes() const {
   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 7f325ce1a1f04..2568087b38791 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -1084,7 +1084,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; DEFAULT-SAME: ptr noalias [[SRC_1:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[SRC_3:%.*]], ptr noalias [[SRC_4:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
 ; DEFAULT-NEXT: entry:
 ; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
 ; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; DEFAULT: vector.scevcheck:
 ; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
@@ -1326,7 +1326,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW7]]
 ; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP4]], [[TMP8]]
 ; PRED-NEXT: [[TMP14:%.*]] = or i1 [[TMP13]], [[TMP12]]
-; PRED-NEXT: br i1 [[TMP14]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED-NEXT: br i1 [[TMP14]], label [[SCALAR_PH]], label [[ENTRY:%.*]]
 ; PRED: vector.ph:
 ; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 7
 ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
@@ -1335,16 +1335,16 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; PRED-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[TMP0]], 8
 ; PRED-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0
 ; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED-NEXT: br label [[LOOP_HEADER:%.*]]
 ; PRED: vector.body:
-; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE27:%.*]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE27]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE27]] ]
-; PRED-NEXT: [[TMP18:%.*]] = load float, ptr [[SRC_1]], align 4
-; PRED-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[LOOP_LATCH]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_LATCH]] ]
+; PRED-NEXT: [[TMP86:%.*]] = load float, ptr [[SRC_1]], align 4
+; PRED-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP86]], i64 0
 ; PRED-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT8]], <8 x float> poison, <8 x i32> zeroinitializer
-; PRED-NEXT: [[TMP19:%.*]] = load float, ptr [[SRC_2]], align 4
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
+; PRED-NEXT: [[TMP87:%.*]] = load float, ptr [[SRC_2]], align 4
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP87]], i64 0
 ; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
 ; PRED-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[BROADCAST_SPLAT]], zeroinitializer
 ; PRED-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[TMP20]])
@@ -1464,7 +1464,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; PRED-NEXT: br label [[PRED_STORE_CONTINUE25]]
 ; PRED: pred.store.continue25:
 ; PRED-NEXT: [[TMP77:%.*]] = extractelement <8 x i1> [[TMP26]], i32 7
-; PRED-NEXT: br i1 [[TMP77]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27]]
+; PRED-NEXT: br i1 [[TMP77]], label [[PRED_STORE_IF26:%.*]], label [[LOOP_LATCH]]
 ; PRED: pred.store.if26:
 ; PRED-NEXT: [[TMP78:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 7
 ; PRED-NEXT: store float 0.000000e+00, ptr [[TMP78]], align 4
@@ -1476,32 +1476,32 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; PRED-NEXT: store float 0.000000e+00, ptr [[TMP82]], align 4
 ; PRED-NEXT: [[TMP83:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 7
 ; PRED-NEXT: store float 0.000000e+00, ptr [[TMP83]], align 4
-; PRED-NEXT: br label [[PRED_STORE_CONTINUE27]]
+; PRED-NEXT: br label [[LOOP_LATCH]]
 ; PRED: pred.store.continue27:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 [[TMP17]])
+; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[IV]], i64 [[TMP17]])
 ; PRED-NEXT: [[TMP84:%.*]] = xor <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
 ; PRED-NEXT: [[TMP85:%.*]] = extractelement <8 x i1> [[TMP84]], i32 0
-; PRED-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; PRED-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]]
 ; PRED: middle.block:
 ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED: scalar.ph:
-; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; PRED-NEXT: br label [[LOOP_HEADER:%.*]]
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT: br label [[LOOP_HEADER1:%.*]]
 ; PRED: loop.header:
-; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; PRED-NEXT: [[TMP86:%.*]] = load float, ptr [[SRC_1]], align 4
-; PRED-NEXT: [[TMP87:%.*]] = load float, ptr [[SRC_2]], align 4
-; PRED-NEXT: [[MUL8_I_US:%.*]] = fmul float [[TMP87]], 0.000000e+00
-; PRED-NEXT: [[TMP88:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP86]], float 0.000000e+00, float [[MUL8_I_US]])
+; PRED-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[LOOP_LATCH1:%.*]] ]
+; PRED-NEXT: [[TMP92:%.*]] = load float, ptr [[SRC_1]], align 4
+; PRED-NEXT: [[TMP93:%.*]] = load float, ptr [[SRC_2]], align 4
+; PRED-NEXT: [[MUL8_I_US:%.*]] = fmul float [[TMP93]], 0.000000e+00
+; PRED-NEXT: [[TMP88:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP92]], float 0.000000e+00, float [[MUL8_I_US]])
 ; PRED-NEXT: [[TMP89:%.*]] = load float, ptr [[SRC_3]], align 4
 ; PRED-NEXT: [[TMP90:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP89]], float 0.000000e+00, float [[TMP88]])
 ; PRED-NEXT: [[TMP91:%.*]] = load float, ptr [[SRC_3]], align 4
 ; PRED-NEXT: [[C:%.*]] = fcmp ogt float [[TMP90]], [[TMP91]]
-; PRED-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; PRED-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH1]]
 ; PRED: if.then:
-; PRED-NEXT: [[DST_0:%.*]] = getelementptr { [4 x float] }, ptr [[DST]], i64 [[IV]]
+; PRED-NEXT: [[DST_0:%.*]] = getelementptr { [4 x float] }, ptr [[DST]], i64 [[IV1]]
 ; PRED-NEXT: store float 0.000000e+00, ptr [[DST_0]], align 4
 ; PRED-NEXT: [[DST_1:%.*]] = getelementptr i8, ptr [[DST_0]], i64 4
 ; PRED-NEXT: store float 0.000000e+00, ptr [[DST_1]], align 4
@@ -1509,11 +1509,11 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; PRED-NEXT: store float 0.000000e+00, ptr [[DST_2]], align 4
 ; PRED-NEXT: [[DST_3:%.*]] = getelementptr i8, ptr [[DST_0]], i64 16
 ; PRED-NEXT: store float 0.000000e+00, ptr [[DST_0]], align 4
-; PRED-NEXT: br label [[LOOP_LATCH]]
+; PRED-NEXT: br label [[LOOP_LATCH1]]
 ; PRED: loop.latch:
-; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP24:![0-9]+]]
+; PRED-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1
+; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[N]]
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP24:![0-9]+]]
 ; PRED: exit:
 ; PRED-NEXT: ret void
 ;
@@ -1546,7 +1546,7 @@ if.then:
 loop.latch:
   %iv.next = add i64 %iv, 1
   %ec = icmp eq i64 %iv, %N
-  br i1 %ec, label %exit, label %loop.header
+  br i1 %ec, label %exit, label %loop.header, !llvm.loop !1
 
 exit:
   ret void
@@ -1706,6 +1706,11 @@ declare float @llvm.fmuladd.f32(float, float, float) #1
 
 attributes #1 = { "target-cpu"="neoverse-512tvb" }
 attributes #2 = { vscale_range(2,2) "target-cpu"="neoverse-512tvb" }
+
+!1 = distinct !{!1, !2, !3, !4}
+!2 = !{!"llvm.loop.vectorize.width", i32 8}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
 ;.
 ; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 553989544c778..81283a7f54c8e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -143,49 +143,49 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; PRED: vector.memcheck:
 ; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 16
 ; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
 ; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
 ; PRED-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; PRED: vector.ph:
 ; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
 ; PRED-NEXT: [[TMP8:%.*]] = sub i64 [[TMP5]], 1
 ; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP8]]
 ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; PRED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 16
 ; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; PRED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16
 ; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
 ; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], [[TMP12]]
 ; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; PRED-NEXT: [[TMP16:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PRED-NEXT: [[TMP16:%.*]] = trunc <vscale x 16 x i32> [[BROADCAST_SPLAT]] to <vscale x 16 x i16>
 ; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; PRED: vector.body:
 ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0
 ; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
 ; PRED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
-; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; PRED-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
-; PRED-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
-; PRED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
-; PRED-NEXT: [[TMP23:%.*]] = or <vscale x 8 x i16> [[TMP21]], [[TMP22]]
-; PRED-NEXT: [[TMP24:%.*]] = lshr <vscale x 8 x i16> [[TMP23]], trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
-; PRED-NEXT: [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
+; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; PRED-NEXT: [[TMP24:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
+; PRED-NEXT: [[TMP25:%.*]] = mul <vscale x 16 x i16> [[TMP24]], [[TMP16]]
+; PRED-NEXT: [[TMP20:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
+; PRED-NEXT: [[TMP21:%.*]] = or <vscale x 16 x i16> [[TMP25]], [[TMP20]]
+; PRED-NEXT: [[TMP22:%.*]] = lshr <vscale x 16 x i16> [[TMP21]], trunc (<vscale x 16 x i32> shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer) to <vscale x 16 x i16>)
+; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP22]] to <vscale x 16 x i8>
 ; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
 ; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; PRED-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
 ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
+; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[TMP28]], i32 0
 ; PRED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PRED: middle.block:
 ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 1eab166b2e553..5c178adccd3ba 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -367,40 +367,40 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
 ; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; PRED: vector.ph:
 ; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
 ; PRED-NEXT: [[TMP5:%.*]] = sub i64 [[TMP2]], 1
 ; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP5]]
 ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
 ; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
 ; PRED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
 ; PRED-NEXT: [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]]
 ; PRED-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], [[TMP9]]
 ; PRED-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
-; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[X]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
 ; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; PRED: vector.body:
 ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
 ; PRED-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP13]]
 ; PRED-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0
-; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP15]], i32 2, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> poison)
-; PRED-NEXT: [[TMP19:%.*]] = udiv <vscale x 4 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 4 x i16> [[TMP19]], [[VEC_PHI]]
-; PRED-NEXT: [[TMP16]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> [[TMP20]], <vscale x 4 x i16> [[VEC_PHI]]
+; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP15]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> poison)
+; PRED-NEXT: [[TMP19:%.*]] = udiv <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
+; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 8 x i16> [[TMP19]], [[VEC_PHI]]
+; PRED-NEXT: [[TMP16]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[TMP20]], <vscale x 8 x i16> [[VEC_PHI]]
 ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
-; PRED-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; PRED-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP12]])
+; PRED-NEXT: [[TMP17:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP18:%.*]] = extractelement <vscale x 8 x i1> [[TMP17]], i32 0
 ; PRED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PRED: middle.block:
-; PRED-NEXT: [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> [[TMP16]])
+; PRED-NEXT: [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16> [[TMP16]])
 ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED: scalar.ph:
 ; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
index af266653af803..c0c5c8a7da26e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
@@ -1,4 +1,7 @@
-; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s
+; REQUIRES: asserts
+; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN:   -debug-only=loop-vectorize < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck --check-prefix=COST %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -32,4 +35,29 @@ for.end: ; preds = %for.body
   ret i32 0
 }
 
+; COST: LV: Checking a loop in 'simple_memset'
+; COST: Cost of 4 for VF 2: EMIT{{.*}}active lane mask
+; COST: Cost of 8 for VF 4: EMIT{{.*}}active lane mask
+; COST: Cost of Invalid for VF vscale x 1: EMIT{{.*}}active lane mask
+; COST: Cost of 1 for VF vscale x 2: EMIT{{.*}}active lane mask
+; COST: Cost of 1 for VF vscale x 4: EMIT{{.*}}active lane mask
+
+define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
+; CHECK-LABEL: @simple_memset(
+; CHECK: call void @llvm.masked.store.nxv4i32.p0(
+entry:
+  br label %while.body
+
+while.body: ; preds = %while.body, %entry
+  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+  %gep = getelementptr i32, ptr %ptr, i64 %index
+  store i32 %val, ptr %gep
+  %index.next = add nsw i64 %index, 1
+  %cmp10 = icmp ult i64 %index.next, %n
+  br i1 %cmp10, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+  ret void
+}
+
 attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 06a4f98d3dc72..fc5a4184a1ad5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -312,38 +312,38 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; SCALABLE-LABEL: @conditional_uniform_load(
 ; SCALABLE-NEXT: entry:
 ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
 ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE: vector.ph:
 ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; SCALABLE-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; SCALABLE-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; SCALABLE-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; SCALABLE-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
 ; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
-; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
-; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B:%.*]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
+; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B:%.*]], i64 0
+; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 10, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i64> poison)
-; SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], <vscale x 2 x i64> zeroinitializer
+; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 10, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i64> poison)
+; SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer
 ; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
 ; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP13]], align 8
+; SCALABLE-NEXT: store <vscale x 4 x i64> [[PREDPHI]], ptr [[TMP13]], align 8
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALABLE: middle.block:
@@ -423,44 +423,44 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; TF-SCALABLE: vector.ph:
 ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
 ; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; TF-SCALABLE-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i64> [[TMP5]], zeroinitializer
-; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP6]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
+; TF-SCALABLE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; TF-SCALABLE-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i64> [[TMP5]], zeroinitializer
+; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP7]]
 ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP4]]
-; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
-; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B:%.*]], i64 0
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
+; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
+; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B:%.*]], i64 0
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; TF-SCALABLE-NEXT: br label [[FOR_BODY:%.*]]
 ; TF-SCALABLE: vector.body:
-; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ]
 ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 1025)
-; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 10, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; TF-SCALABLE-NEXT: [[TMP11:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> zeroinitializer
-; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i64> poison)
-; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], <vscale x 2 x i64> zeroinitializer
+; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 1025)
+; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 10, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; TF-SCALABLE-NEXT: [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
+; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i64> poison)
+; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer
 ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP9]]
 ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr [[TMP13]], i32 8, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
 ; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; TF-SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; TF-SCALABLE: middle.block:
 ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; TF-SCALABLE: scalar.ph:
 ; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; TF-SCALABLE-NEXT: br label [[FOR_BODY:%.*]]
+; TF-SCALABLE-NEXT: br label [[FOR_BODY1:%.*]]
 ; TF-SCALABLE: for.body:
 ; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; TF-SCALABLE-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
@@ -469,12 +469,12 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
 ; TF-SCALABLE-NEXT: br label [[LATCH]]
 ; TF-SCALABLE: latch:
-; TF-SCALABLE-NEXT: [[PHI:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[V]], [[DO_LOAD]] ]
+; TF-SCALABLE-NEXT: [[PHI:%.*]] = phi i64 [ 0, [[FOR_BODY1]] ], [ [[V]], [[DO_LOAD]] ]
 ; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; TF-SCALABLE-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP5:![0-9]+]]
 ; TF-SCALABLE: for.end:
 ; TF-SCALABLE-NEXT: ret void
 ;
@@ -484,10 +484,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-FIXEDLEN: vector.ph:
 ; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0
 ; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]]
+; TF-FIXEDLEN-NEXT: br label [[FOR_BODY:%.*]]
 ; TF-FIXEDLEN: vector.body:
-; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ]
 ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 1025)
 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], <i64 10, i64 10, i64 10, i64 10>
@@ -500,12 +500,12 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; TF-FIXEDLEN: middle.block:
 ; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; TF-FIXEDLEN: scalar.ph:
 ; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1028, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; TF-FIXEDLEN-NEXT: br label [[FOR_BODY:%.*]]
+; TF-FIXEDLEN-NEXT: br label [[FOR_BODY1:%.*]]
 ; TF-FIXEDLEN: for.body:
 ; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; TF-FIXEDLEN-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
@@ -514,12 +514,12 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-FIXEDLEN-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
 ; TF-FIXEDLEN-NEXT: br label [[LATCH]]
 ; TF-FIXEDLEN: latch:
-; TF-FIXEDLEN-NEXT: [[PHI:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[V]], [[DO_LOAD]] ]
+; TF-FIXEDLEN-NEXT: [[PHI:%.*]] = phi i64 [ 0, [[FOR_BODY1]] ], [ [[V]], [[DO_LOAD]] ]
 ; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; TF-FIXEDLEN-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
 ; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP5:![0-9]+]]
 ; TF-FIXEDLEN: for.end:
 ; TF-FIXEDLEN-NEXT: ret void
 ;
@@ -540,7 +540,7 @@ latch:
   store i64 %phi, ptr %arrayidx
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 1025
-  br i1 %exitcond.not, label %for.end, label %for.body
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
 
 for.end:
   ret void
@@ -1517,3 +1517,7 @@ for.body:
 for.end:
   ret void
 }
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index c492b296903e6..bd6bdb6ee573f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -32,34 +32,34 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP0]], [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 2, i1 true)
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[TMP0]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = sub nuw nsw i64 1, [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP9]]
-; CHECK-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP9]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP14]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i64 [[TMP18]]
-; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP20]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
-; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = sub nuw nsw i64 1, [[IV]]
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP23]]
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP9]]
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP9]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[TMP16]], i64 [[TMP15]]
+; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP10]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP17]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP10]])
+; CHECK-NEXT: [[TMP18:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK: [[SCALAR_PH]]:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = sub nuw nsw i64 1, [[IV]]
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP23]]
-; CHECK-NEXT: store i64 0, ptr [[ARRAYIDX13]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP20:%.*]] = sub nuw nsw i64 1, [[IV1]]
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP20]]
+; CHECK-NEXT: store i64 0, ptr [[ARRAYIDX14]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: [[EXIT]]:
@@ -77,11 +77,16 @@ loop:
   store i64 0, ptr %arrayidx13, align 8
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 3
-  br i1 %exitcond.not, label %exit, label %loop
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !1
 
 exit:
   ret void
 }
+
+!1 = distinct !{!1, !2, !3, !4}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
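
For reference, these are the two intrinsics whose cost the patch models via getIntrinsicInstrCost, with their LangRef signatures. The declarations below are illustrative only (they are not part of the patch), and the nxv4i1 instance is just one example overload:

; Lane i of the result is active iff %base + i < %n.
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %base, i64 %n)
; Returns the number of lanes to process this step: at most %cnt, and at most
; %vf (times vscale when %scalable is true).
declare i32 @llvm.experimental.get.vector.length.i64(i64 %cnt, i32 immarg %vf, i1 immarg %scalable)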