Skip to content

Commit 8cbbc09

Browse files
committed
[LoopVectorize] Add cost of generating tail-folding mask to the loop
At the moment, if we decide to enable tail-folding, we do not include the cost of generating the mask per VF. This can mean we make some poor choices of VF, which is definitely true for SVE-enabled AArch64 targets, where mask generation for fixed-width vectors is more expensive than for scalable vectors. I've added a VPInstruction::computeCost function to return the costs of the ActiveLaneMask and ExplicitVectorLength operations. Unfortunately, in order to prevent asserts firing, I've also had to duplicate the same code in the legacy cost model to make sure the chosen VFs match up. I've wrapped this up in an `#ifndef NDEBUG` block for now. New tests added: Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
1 parent c72a751 commit 8cbbc09

File tree

10 files changed

+240
-136
lines changed

10 files changed

+240
-136
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5577,6 +5577,34 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
55775577
Cost += BlockCost;
55785578
}
55795579

5580+
#ifndef NDEBUG
5581+
// TODO: We're effectively having to duplicate the code from
5582+
// VPInstruction::computeCost, which is ugly. This isn't meant to be a fully
5583+
// accurate representation of the cost of tail-folding - it exists purely to
5584+
// stop asserts firing when the legacy cost doesn't match the VPlan cost.
5585+
if (!VF.isScalar() && foldTailByMasking()) {
5586+
TailFoldingStyle Style = getTailFoldingStyle();
5587+
LLVMContext &Context = TheLoop->getHeader()->getContext();
5588+
Type *I1Ty = IntegerType::getInt1Ty(Context);
5589+
Type *IndTy = Legal->getWidestInductionType();
5590+
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5591+
if (Style == TailFoldingStyle::DataWithEVL) {
5592+
Type *I32Ty = IntegerType::getInt32Ty(Context);
5593+
IntrinsicCostAttributes Attrs(
5594+
Intrinsic::experimental_get_vector_length, I32Ty,
5595+
{PoisonValue::get(IndTy), PoisonValue::get(I32Ty),
5596+
PoisonValue::get(I1Ty)});
5597+
Cost += TTI.getIntrinsicInstrCost(Attrs, CostKind);
5598+
} else if (useActiveLaneMask(Style)) {
5599+
VectorType *RetTy = VectorType::get(I1Ty, VF);
5600+
IntrinsicCostAttributes Attrs(
5601+
Intrinsic::get_active_lane_mask, RetTy,
5602+
{PoisonValue::get(IndTy), PoisonValue::get(IndTy)});
5603+
Cost += TTI.getIntrinsicInstrCost(Attrs, CostKind);
5604+
}
5605+
}
5606+
#endif
5607+
55805608
return Cost;
55815609
}
55825610

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1378,6 +1378,10 @@ class VPInstruction : public VPRecipeWithIRFlags,
13781378

13791379
/// Returns the symbolic name assigned to the VPInstruction.
13801380
StringRef getName() const { return Name; }
1381+
1382+
/// Return the cost of this VPInstruction.
1383+
InstructionCost computeCost(ElementCount VF,
1384+
VPCostContext &Ctx) const override;
13811385
};
13821386

13831387
/// A recipe to wrap an original IR instruction not to be modified during

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
5858
CachedTypes[OtherV] = ResTy;
5959
return ResTy;
6060
}
61+
case VPInstruction::CalculateTripCountMinusVF: {
62+
return inferScalarType(R->getOperand(0));
63+
}
6164
case Instruction::ICmp:
6265
case VPInstruction::ActiveLaneMask:
6366
return inferScalarType(R->getOperand(1));

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,33 @@ VPInstruction::VPInstruction(unsigned Opcode,
356356
assert(isFPMathOp() && "this op can't take fast-math flags");
357357
}
358358

359+
/// Compute the cost of this VPInstruction for vectorization factor \p VF.
///
/// Only two opcodes are costed so far: ActiveLaneMask is priced as a call to
/// llvm.get.active.lane.mask and ExplicitVectorLength as a call to
/// llvm.experimental.get.vector.length. Every other opcode currently reports
/// a cost of zero.
///
/// \param VF  The vectorization factor; used as the element count of the
///            mask type returned by the active-lane-mask intrinsic.
/// \param Ctx Cost context supplying the TTI implementation, the LLVMContext
///            and the VPlan type analysis.
/// \returns The reciprocal-throughput cost reported by TTI for the matching
///          intrinsic, or 0 for opcodes that are not handled yet.
InstructionCost VPInstruction::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  switch (getOpcode()) {
  case VPInstruction::ActiveLaneMask: {
    // The scalar type of operand 1 is used for both intrinsic arguments; the
    // result is a vector of VF x i1.
    Type *ArgTy = Ctx.Types.inferScalarType(getOperand(1));
    Type *RetTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
    IntrinsicCostAttributes Attrs(
        Intrinsic::get_active_lane_mask, RetTy,
        {PoisonValue::get(ArgTy), PoisonValue::get(ArgTy)});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, CostKind);
  }
  case VPInstruction::ExplicitVectorLength: {
    // llvm.experimental.get.vector.length takes three arguments: the element
    // count to process, an i32 VF and an i1 "scalable" flag. All three must
    // be supplied so the query is well-formed and has the same shape as the
    // three-argument query issued by the legacy cost model in
    // LoopVectorizationCostModel::expectedCost.
    // NOTE(review): operand 0 is assumed to be the AVL feeding the
    // intrinsic - confirm against the recipe's construction site.
    Type *AVLTy = Ctx.Types.inferScalarType(getOperand(0));
    Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
    Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
    IntrinsicCostAttributes Attrs(
        Intrinsic::experimental_get_vector_length, I32Ty,
        {PoisonValue::get(AVLTy), PoisonValue::get(I32Ty),
         PoisonValue::get(I1Ty)});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, CostKind);
  }
  default:
    // TODO: Fill out other opcodes!
    return 0;
  }
}
385+
359386
bool VPInstruction::doesGeneratePerAllLanes() const {
360387
return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
361388
}

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 34 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1084,7 +1084,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
10841084
; DEFAULT-SAME: ptr noalias [[SRC_1:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[SRC_3:%.*]], ptr noalias [[SRC_4:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
10851085
; DEFAULT-NEXT: entry:
10861086
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
1087-
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
1087+
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
10881088
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
10891089
; DEFAULT: vector.scevcheck:
10901090
; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
@@ -1326,7 +1326,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
13261326
; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW7]]
13271327
; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP4]], [[TMP8]]
13281328
; PRED-NEXT: [[TMP14:%.*]] = or i1 [[TMP13]], [[TMP12]]
1329-
; PRED-NEXT: br i1 [[TMP14]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1329+
; PRED-NEXT: br i1 [[TMP14]], label [[SCALAR_PH]], label [[ENTRY:%.*]]
13301330
; PRED: vector.ph:
13311331
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 7
13321332
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
@@ -1335,16 +1335,16 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
13351335
; PRED-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[TMP0]], 8
13361336
; PRED-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0
13371337
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 0, i64 [[TMP0]])
1338-
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
1338+
; PRED-NEXT: br label [[LOOP_HEADER:%.*]]
13391339
; PRED: vector.body:
1340-
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE27:%.*]] ]
1341-
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE27]] ]
1342-
; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE27]] ]
1343-
; PRED-NEXT: [[TMP18:%.*]] = load float, ptr [[SRC_1]], align 4
1344-
; PRED-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
1340+
; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
1341+
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[LOOP_LATCH]] ]
1342+
; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_LATCH]] ]
1343+
; PRED-NEXT: [[TMP86:%.*]] = load float, ptr [[SRC_1]], align 4
1344+
; PRED-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP86]], i64 0
13451345
; PRED-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT8]], <8 x float> poison, <8 x i32> zeroinitializer
1346-
; PRED-NEXT: [[TMP19:%.*]] = load float, ptr [[SRC_2]], align 4
1347-
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
1346+
; PRED-NEXT: [[TMP87:%.*]] = load float, ptr [[SRC_2]], align 4
1347+
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP87]], i64 0
13481348
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
13491349
; PRED-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[BROADCAST_SPLAT]], zeroinitializer
13501350
; PRED-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[TMP20]])
@@ -1464,7 +1464,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
14641464
; PRED-NEXT: br label [[PRED_STORE_CONTINUE25]]
14651465
; PRED: pred.store.continue25:
14661466
; PRED-NEXT: [[TMP77:%.*]] = extractelement <8 x i1> [[TMP26]], i32 7
1467-
; PRED-NEXT: br i1 [[TMP77]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27]]
1467+
; PRED-NEXT: br i1 [[TMP77]], label [[PRED_STORE_IF26:%.*]], label [[LOOP_LATCH]]
14681468
; PRED: pred.store.if26:
14691469
; PRED-NEXT: [[TMP78:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 7
14701470
; PRED-NEXT: store float 0.000000e+00, ptr [[TMP78]], align 4
@@ -1476,44 +1476,44 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
14761476
; PRED-NEXT: store float 0.000000e+00, ptr [[TMP82]], align 4
14771477
; PRED-NEXT: [[TMP83:%.*]] = extractelement <8 x ptr> [[TMP27]], i32 7
14781478
; PRED-NEXT: store float 0.000000e+00, ptr [[TMP83]], align 4
1479-
; PRED-NEXT: br label [[PRED_STORE_CONTINUE27]]
1479+
; PRED-NEXT: br label [[LOOP_LATCH]]
14801480
; PRED: pred.store.continue27:
1481-
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
1482-
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 [[TMP17]])
1481+
; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
1482+
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[IV]], i64 [[TMP17]])
14831483
; PRED-NEXT: [[TMP84:%.*]] = xor <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
14841484
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
14851485
; PRED-NEXT: [[TMP85:%.*]] = extractelement <8 x i1> [[TMP84]], i32 0
1486-
; PRED-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
1486+
; PRED-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]]
14871487
; PRED: middle.block:
14881488
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
14891489
; PRED: scalar.ph:
1490-
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
1491-
; PRED-NEXT: br label [[LOOP_HEADER:%.*]]
1490+
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
1491+
; PRED-NEXT: br label [[LOOP_HEADER1:%.*]]
14921492
; PRED: loop.header:
1493-
; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
1494-
; PRED-NEXT: [[TMP86:%.*]] = load float, ptr [[SRC_1]], align 4
1495-
; PRED-NEXT: [[TMP87:%.*]] = load float, ptr [[SRC_2]], align 4
1496-
; PRED-NEXT: [[MUL8_I_US:%.*]] = fmul float [[TMP87]], 0.000000e+00
1497-
; PRED-NEXT: [[TMP88:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP86]], float 0.000000e+00, float [[MUL8_I_US]])
1493+
; PRED-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[LOOP_LATCH1:%.*]] ]
1494+
; PRED-NEXT: [[TMP92:%.*]] = load float, ptr [[SRC_1]], align 4
1495+
; PRED-NEXT: [[TMP93:%.*]] = load float, ptr [[SRC_2]], align 4
1496+
; PRED-NEXT: [[MUL8_I_US:%.*]] = fmul float [[TMP93]], 0.000000e+00
1497+
; PRED-NEXT: [[TMP88:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP92]], float 0.000000e+00, float [[MUL8_I_US]])
14981498
; PRED-NEXT: [[TMP89:%.*]] = load float, ptr [[SRC_3]], align 4
14991499
; PRED-NEXT: [[TMP90:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP89]], float 0.000000e+00, float [[TMP88]])
15001500
; PRED-NEXT: [[TMP91:%.*]] = load float, ptr [[SRC_3]], align 4
15011501
; PRED-NEXT: [[C:%.*]] = fcmp ogt float [[TMP90]], [[TMP91]]
1502-
; PRED-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
1502+
; PRED-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH1]]
15031503
; PRED: if.then:
1504-
; PRED-NEXT: [[DST_0:%.*]] = getelementptr { [4 x float] }, ptr [[DST]], i64 [[IV]]
1504+
; PRED-NEXT: [[DST_0:%.*]] = getelementptr { [4 x float] }, ptr [[DST]], i64 [[IV1]]
15051505
; PRED-NEXT: store float 0.000000e+00, ptr [[DST_0]], align 4
15061506
; PRED-NEXT: [[DST_1:%.*]] = getelementptr i8, ptr [[DST_0]], i64 4
15071507
; PRED-NEXT: store float 0.000000e+00, ptr [[DST_1]], align 4
15081508
; PRED-NEXT: [[DST_2:%.*]] = getelementptr i8, ptr [[DST_0]], i64 8
15091509
; PRED-NEXT: store float 0.000000e+00, ptr [[DST_2]], align 4
15101510
; PRED-NEXT: [[DST_3:%.*]] = getelementptr i8, ptr [[DST_0]], i64 16
15111511
; PRED-NEXT: store float 0.000000e+00, ptr [[DST_0]], align 4
1512-
; PRED-NEXT: br label [[LOOP_LATCH]]
1512+
; PRED-NEXT: br label [[LOOP_LATCH1]]
15131513
; PRED: loop.latch:
1514-
; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
1515-
; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
1516-
; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP24:![0-9]+]]
1514+
; PRED-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1
1515+
; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[N]]
1516+
; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP24:![0-9]+]]
15171517
; PRED: exit:
15181518
; PRED-NEXT: ret void
15191519
;
@@ -1546,7 +1546,7 @@ if.then:
15461546
loop.latch:
15471547
%iv.next = add i64 %iv, 1
15481548
%ec = icmp eq i64 %iv, %N
1549-
br i1 %ec, label %exit, label %loop.header
1549+
br i1 %ec, label %exit, label %loop.header, !llvm.loop !1
15501550

15511551
exit:
15521552
ret void
@@ -1706,6 +1706,11 @@ declare float @llvm.fmuladd.f32(float, float, float) #1
17061706
attributes #1 = { "target-cpu"="neoverse-512tvb" }
17071707
attributes #2 = { vscale_range(2,2) "target-cpu"="neoverse-512tvb" }
17081708

1709+
1710+
!1 = distinct !{!1, !2, !3, !4}
1711+
!2 = !{!"llvm.loop.vectorize.width", i32 8}
1712+
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
1713+
!4 = !{!"llvm.loop.vectorize.enable", i1 true}
17091714
;.
17101715
; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
17111716
; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -143,49 +143,49 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
143143
; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
144144
; PRED: vector.memcheck:
145145
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
146-
; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
146+
; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 16
147147
; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
148148
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
149149
; PRED-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
150150
; PRED: vector.ph:
151151
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
152-
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
152+
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
153153
; PRED-NEXT: [[TMP8:%.*]] = sub i64 [[TMP5]], 1
154154
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP8]]
155155
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
156156
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
157157
; PRED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
158-
; PRED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
158+
; PRED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 16
159159
; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
160-
; PRED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
160+
; PRED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16
161161
; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
162162
; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], [[TMP12]]
163163
; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
164-
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
165-
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
166-
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
167-
; PRED-NEXT: [[TMP16:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
164+
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP0]])
165+
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
166+
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
167+
; PRED-NEXT: [[TMP16:%.*]] = trunc <vscale x 16 x i32> [[BROADCAST_SPLAT]] to <vscale x 16 x i16>
168168
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
169169
; PRED: vector.body:
170170
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
171-
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
171+
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
172172
; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0
173173
; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
174174
; PRED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
175-
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
176-
; PRED-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
177-
; PRED-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
178-
; PRED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
179-
; PRED-NEXT: [[TMP23:%.*]] = or <vscale x 8 x i16> [[TMP21]], [[TMP22]]
180-
; PRED-NEXT: [[TMP24:%.*]] = lshr <vscale x 8 x i16> [[TMP23]], trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
181-
; PRED-NEXT: [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
175+
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
176+
; PRED-NEXT: [[TMP24:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
177+
; PRED-NEXT: [[TMP25:%.*]] = mul <vscale x 16 x i16> [[TMP24]], [[TMP16]]
178+
; PRED-NEXT: [[TMP20:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
179+
; PRED-NEXT: [[TMP21:%.*]] = or <vscale x 16 x i16> [[TMP25]], [[TMP20]]
180+
; PRED-NEXT: [[TMP22:%.*]] = lshr <vscale x 16 x i16> [[TMP21]], trunc (<vscale x 16 x i32> shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer) to <vscale x 16 x i16>)
181+
; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP22]] to <vscale x 16 x i8>
182182
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
183183
; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
184-
; PRED-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
184+
; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
185185
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
186-
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
187-
; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
188-
; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
186+
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
187+
; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
188+
; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[TMP28]], i32 0
189189
; PRED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
190190
; PRED: middle.block:
191191
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]

0 commit comments

Comments
 (0)