From 49ca38600c0a60820f0a2716b9035512660f0563 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 20 Jun 2024 21:16:36 -0700 Subject: [PATCH 01/35] Remove the FIXME --- .../Transforms/Vectorize/VPlanTransforms.cpp | 9 +- .../LoopVectorize/RISCV/inloop-reduction.ll | 57 +- ...rize-force-tail-with-evl-cond-reduction.ll | 146 ++- ...ze-force-tail-with-evl-inloop-reduction.ll | 136 ++- ...-force-tail-with-evl-intermediate-store.ll | 78 +- ...vectorize-force-tail-with-evl-reduction.ll | 943 +++++++++++++++--- .../RISCV/vplan-vp-intrinsics-reduction.ll | 46 +- 7 files changed, 1228 insertions(+), 187 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index a4f0df17f5832..21721dbce1e7c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1553,14 +1553,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength( return isa( &Phi); }); - // FIXME: Remove this once we can transform (select header_mask, true_value, - // false_value) into vp.merge. - bool ContainsOutloopReductions = - any_of(Header->phis(), [&](VPRecipeBase &Phi) { - auto *R = dyn_cast(&Phi); - return R && !R->isInLoop(); - }); - if (ContainsWidenInductions || ContainsOutloopReductions) + if (ContainsWidenInductions) return false; auto *CanonicalIVPHI = Plan.getCanonicalIV(); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll index 0381f6dae9811..c6305e21920fe 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -122,19 +122,62 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; IF-EVL-OUTLOOP-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; IF-EVL-OUTLOOP: for.body.preheader: +; IF-EVL-OUTLOOP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL-OUTLOOP: vector.ph: +; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 1 +; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP2]] +; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] +; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1 +; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-OUTLOOP: vector.body: +; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 4, i1 true) +; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = add i32 [[EVL_BASED_IV]], 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[EVL_BASED_IV]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = add zeroinitializer, [[TMP8]] +; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP9]] +; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]] +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = sext [[VP_OP_LOAD]] to +; IF-EVL-OUTLOOP-NEXT: [[TMP14]] = add [[VEC_PHI]], [[TMP13]] +; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = select [[TMP10]], [[TMP14]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]] +; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-OUTLOOP: middle.block: +; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP15]]) +; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL-OUTLOOP: scalar.ph: +; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-OUTLOOP: for.body: -; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; IF-EVL-OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[I_08]] -; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; IF-EVL-OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]] +; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32 ; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]] ; IF-EVL-OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-OUTLOOP: for.cond.cleanup.loopexit: -; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]] ; IF-EVL-OUTLOOP: for.cond.cleanup: ; IF-EVL-OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index fc12dd54f88df..6f64cb822a43d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -25,20 +25,70 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-LABEL: define i32 @cond_add( ; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] { ; IF-EVL-OUTLOOP-NEXT: entry: +; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; IF-EVL-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL-OUTLOOP: vector.ph: +; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-OUTLOOP: vector.body: +; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT1:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV1]] +; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV1]], 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV1]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] +; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] +; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] +; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = select [[TMP18]], [[VP_OP_LOAD]], zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[TMP20]] = add [[TMP19]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = select [[TMP15]], [[TMP20]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT1]] = add i64 [[TMP22]], [[EVL_BASED_IV1]] +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] +; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N_VEC]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-OUTLOOP: middle.block: +; IF-EVL-OUTLOOP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP21]]) +; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-OUTLOOP: scalar.ph: +; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-OUTLOOP: for.body: -; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP27]], 3 ; IF-EVL-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP27]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] -; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1 -; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-OUTLOOP: for.end: -; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]] ; ; IF-EVL-INLOOP-LABEL: define i32 @cond_add( @@ -225,24 +275,76 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-LABEL: define i32 @cond_add_pred( ; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { ; IF-EVL-OUTLOOP-NEXT: entry: +; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; IF-EVL-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL-OUTLOOP: vector.ph: +; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-OUTLOOP: vector.body: +; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] +; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] +; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] +; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = add [[VEC_PHI]], [[VP_OP_LOAD]] +; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = xor [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = select [[TMP15]], [[TMP20]], zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP21]], [[VEC_PHI]], [[TMP19]] +; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = select [[TMP15]], [[PREDPHI]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-OUTLOOP-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-OUTLOOP: middle.block: +; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP22]]) +; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-OUTLOOP: scalar.ph: +; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-OUTLOOP: for.body: -; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ] -; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP27:%.*]], [[MIDDLE_BLOCK]] ] +; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ] ; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; IF-EVL-OUTLOOP-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP28]], 3 -; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[MIDDLE_BLOCK]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; IF-EVL-OUTLOOP: if.then: -; IF-EVL-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[BC_MERGE_RDX]], [[TMP28]] -; IF-EVL-OUTLOOP-NEXT: br label [[MIDDLE_BLOCK]] +; IF-EVL-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP28]] +; IF-EVL-OUTLOOP-NEXT: br label [[FOR_INC]] ; IF-EVL-OUTLOOP: for.inc: -; IF-EVL-OUTLOOP-NEXT: [[TMP27]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL-OUTLOOP: for.end: -; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] ; ; IF-EVL-INLOOP-LABEL: define i32 @cond_add_pred( @@ -469,7 +571,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL-OUTLOOP: for.end: ; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]] @@ -659,7 +761,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP37]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6]] ; IF-EVL-OUTLOOP: for.end: ; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] @@ -851,8 +953,14 @@ for.end: !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.vectorize.enable", i1 true} ;. -; IF-EVL-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} -; IF-EVL-OUTLOOP: [[META1]] = !{!"llvm.loop.vectorize.enable", i1 true} +; IF-EVL-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; IF-EVL-OUTLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; IF-EVL-OUTLOOP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; IF-EVL-OUTLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; IF-EVL-OUTLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; IF-EVL-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]]} +; IF-EVL-OUTLOOP: [[META7]] = !{!"llvm.loop.vectorize.enable", i1 true} ;. ; IF-EVL-INLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; IF-EVL-INLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll index 4f2fe3d167232..d60aeaf16cd12 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll @@ -1778,19 +1778,69 @@ for.end: define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-LABEL: @anyof_icmp( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP18]] = or [[VEC_PHI]], [[TMP17]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP19]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = freeze i1 [[TMP22]] +; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP23]], i32 [[INV:%.*]], i32 [[START:%.*]] +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP0]], 3 -; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV:%.*]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP24]], 3 +; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]] ; ; NO-VP-LABEL: @anyof_icmp( @@ -1864,19 +1914,69 @@ for.end: define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-LABEL: @anyof_fcmp( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = fcmp fast olt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP18]] = or [[VEC_PHI]], [[TMP17]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP19]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = freeze i1 [[TMP22]] +; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP23]], i32 [[INV:%.*]], i32 [[START:%.*]] +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP0]], 3.000000e+00 -; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV:%.*]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP24]], 3.000000e+00 +; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]] ; ; NO-VP-LABEL: @anyof_fcmp( diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll index fcea9e8d81ff6..d6675d139b4e2 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll @@ -25,17 +25,74 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-OUTLOOP-LABEL: define void @reduction_intermediate_store( ; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { ; IF-EVL-OUTLOOP-NEXT: entry: +; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; IF-EVL-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; IF-EVL-OUTLOOP: vector.memcheck: +; IF-EVL-OUTLOOP-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ADDR]], i64 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 2 +; IF-EVL-OUTLOOP-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]] +; IF-EVL-OUTLOOP-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ADDR]], [[SCEVGEP1]] +; IF-EVL-OUTLOOP-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] +; IF-EVL-OUTLOOP-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[ENTRY:%.*]] +; IF-EVL-OUTLOOP: vector.ph: +; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 +; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP7]] +; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL-OUTLOOP: vector.body: +; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP10]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) +; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = add zeroinitializer, [[TMP14]] +; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP15]] +; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT3]] +; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] +; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]), !alias.scope [[META0:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: [[TMP19]] = add [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = select [[TMP16]], [[TMP19]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP9]] +; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL-OUTLOOP: middle.block: +; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP20]]) +; IF-EVL-OUTLOOP-NEXT: store i32 [[TMP23]], ptr [[ADDR]], align 4 +; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-OUTLOOP: scalar.ph: +; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_MEMCHECK]] ], [ [[START]], [[ENTRY1]] ] +; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL-OUTLOOP: for.body: -; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-OUTLOOP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] +; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY1]] ] +; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] ; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[TMP27]], [[RDX]] ; IF-EVL-OUTLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 -; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 +; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL-OUTLOOP: for.end: ; IF-EVL-OUTLOOP-NEXT: ret void ; @@ -236,8 +293,13 @@ for.end: !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.vectorize.enable", i1 true} ;. -; IF-EVL-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} -; IF-EVL-OUTLOOP: [[META1]] = !{!"llvm.loop.vectorize.enable", i1 true} +; IF-EVL-OUTLOOP: [[META0]] = !{[[META1:![0-9]+]]} +; IF-EVL-OUTLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; IF-EVL-OUTLOOP: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; IF-EVL-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} +; IF-EVL-OUTLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} +; IF-EVL-OUTLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} ;. ; IF-EVL-INLOOP: [[META0]] = !{[[META1:![0-9]+]]} ; IF-EVL-INLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll index f82611f414de2..405a4e2f14be2 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll @@ -12,18 +12,66 @@ define i32 @add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-LABEL: @add( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START:%.*]], i32 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] +; IF-EVL-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18]] = add [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP15]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP23]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[ADD_LCSSA]] ; ; NO-VP-LABEL: @add( @@ -103,7 +151,7 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[MUL]] = mul nsw i32 [[TMP0]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ] ; IF-EVL-NEXT: ret i32 [[MUL_LCSSA]] @@ -174,18 +222,66 @@ for.end: define i32 @or(ptr %a, i64 %n, i32 %start) { ; IF-EVL-LABEL: @or( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START:%.*]], i32 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] +; IF-EVL-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18]] = or [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP15]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[OR:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[OR]] = or i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[OR]] = or i32 [[TMP23]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[OR_LCSSA]] ; ; NO-VP-LABEL: @or( @@ -255,18 +351,66 @@ for.end: define i32 @and(ptr %a, i64 %n, i32 %start) { ; IF-EVL-LABEL: @and( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement shufflevector ( insertelement ( poison, i32 -1, i64 0), poison, zeroinitializer), i32 [[START:%.*]], i32 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] +; IF-EVL-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18]] = and [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP15]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[AND:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[AND]] = and i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[AND]] = and i32 [[TMP23]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[AND_LCSSA]] ; ; NO-VP-LABEL: @and( @@ -336,18 +480,66 @@ for.end: define i32 @xor(ptr %a, i64 %n, i32 %start) { ; IF-EVL-LABEL: @xor( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START:%.*]], i32 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] +; IF-EVL-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18]] = xor [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP15]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[XOR:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP23]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[XOR_LCSSA]] ; ; NO-VP-LABEL: @xor( @@ -417,19 +609,69 @@ for.end: define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-LABEL: @smin( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp slt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP18]] = select [[TMP17]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP0]], [[RDX]] -; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP23]], [[RDX]] +; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP23]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[SMIN_LCSSA]] ; ; NO-VP-LABEL: @smin( @@ -503,19 +745,69 @@ for.end: define i32 @smax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-LABEL: @smax( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp sgt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP18]] = select [[TMP17]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP0]], [[RDX]] -; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP23]], [[RDX]] +; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP23]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[SMAX_LCSSA]] ; ; NO-VP-LABEL: @smax( @@ -589,19 +881,69 @@ for.end: define i32 @umin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-LABEL: @umin( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ult [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP18]] = select [[TMP17]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP0]], [[RDX]] -; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP23]], [[RDX]] +; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP23]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[UMIN_LCSSA]] ; ; NO-VP-LABEL: @umin( @@ -675,19 +1017,69 @@ for.end: define i32 @umax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-LABEL: @umax( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ugt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP18]] = select [[TMP17]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP0]], [[RDX]] -; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP23]], [[RDX]] +; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP23]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[UMAX_LCSSA]] ; ; NO-VP-LABEL: @umax( @@ -761,18 +1153,66 @@ for.end: define float @fadd(ptr %a, i64 %n, float %start) { ; IF-EVL-LABEL: @fadd( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[START:%.*]], i32 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] +; IF-EVL-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18]] = fadd reassoc [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select reassoc [[TMP15]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP19]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP23]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[ADD_LCSSA]] ; ; NO-VP-LABEL: @fadd( @@ -852,7 +1292,7 @@ define float @fmul(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP0]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ] ; IF-EVL-NEXT: ret float [[MUL_LCSSA]] @@ -923,19 +1363,69 @@ for.end: define float @fmin(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-LABEL: @fmin( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, float [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = fcmp fast olt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP18]] = select [[TMP17]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select fast [[TMP14]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32( [[TMP19]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], [[RDX]] -; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP0]], float [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP23]], [[RDX]] +; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP23]], float [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MIN_LCSSA]] ; ; NO-VP-LABEL: @fmin( @@ -1009,19 +1499,69 @@ for.end: define float @fmax(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-LABEL: @fmax( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, float [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = fcmp fast ogt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP18]] = select [[TMP17]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select fast [[TMP14]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32( [[TMP19]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP0]], [[RDX]] -; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP0]], float [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP23]], [[RDX]] +; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP23]], float [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MAX_LCSSA]] ; ; NO-VP-LABEL: @fmax( @@ -1104,7 +1644,7 @@ define float @fminimum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]]) ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ] ; IF-EVL-NEXT: ret float [[MIN_LCSSA]] @@ -1185,7 +1725,7 @@ define float @fmaximum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]]) ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ] ; IF-EVL-NEXT: ret float [[MAX_LCSSA]] @@ -1257,20 +1797,71 @@ for.end: define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) { ; IF-EVL-LABEL: @fmuladd( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[START:%.*]], i32 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] +; IF-EVL-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP20]] = call reassoc @llvm.fmuladd.nxv4f32( [[VP_OP_LOAD]], [[VP_OP_LOAD3]], [[VEC_PHI]]) +; IF-EVL-NEXT: [[TMP21:%.*]] = select reassoc [[TMP15]], [[TMP20]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP22]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP24:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP21]]) +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[RDX]]) +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP25]], float [[TMP26]], float [[RDX]]) ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MULADD_LCSSA]] ; ; NO-VP-LABEL: @fmuladd( @@ -1347,19 +1938,69 @@ for.end: define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-LABEL: @anyof_icmp( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP18]] = or [[VEC_PHI]], [[TMP17]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP19]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = freeze i1 [[TMP22]] +; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP23]], i32 [[INV:%.*]], i32 [[START:%.*]] +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP0]], 3 -; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV:%.*]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP24]], 3 +; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]] ; ; NO-VP-LABEL: @anyof_icmp( @@ -1433,19 +2074,69 @@ for.end: define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-LABEL: @anyof_fcmp( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = fcmp fast olt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP18]] = or [[VEC_PHI]], [[TMP17]] +; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP19]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = freeze i1 [[TMP22]] +; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP23]], i32 [[INV:%.*]], i32 [[START:%.*]] +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP0]], 3.000000e+00 -; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV:%.*]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP24]], 3.000000e+00 +; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]] ; ; NO-VP-LABEL: @anyof_fcmp( diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 77a9d105c85f3..3377ae0a9f74e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -24,7 +24,51 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { -; IF-EVL-OUTLOOP-NOT: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { +; IF-EVL-OUTLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { +; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[BEC:%[0-9]+]]> = backedge-taken count +; IF-EVL-OUTLOOP-NEXT: Live-in ir<%n> = original trip-count +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP: vector.ph: +; IF-EVL-OUTLOOP-NEXT: Successor(s): vector loop +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP-NEXT: vector loop: { +; IF-EVL-OUTLOOP-NEXT: vector.body: +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; IF-EVL-OUTLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> +; IF-EVL-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%n> +; IF-EVL-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[VEC_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[EVL_PHI]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[ACTIVE:%.+]]> = icmp ule vp<[[VEC_IV]]>, vp<[[BEC]]> +; IF-EVL-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; IF-EVL-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> +; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD1]]>, ir<[[RDX_PHI]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_SELECT:%.+]]> = select vp<[[ACTIVE]]>, ir<[[ADD]]>, ir<[[RDX_PHI]]> +; IF-EVL-OUTLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-OUTLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> +; IF-EVL-OUTLOOP-NEXT: No successors +; IF-EVL-OUTLOOP-NEXT: } +; IF-EVL-OUTLOOP-NEXT: Successor(s): middle.block +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP-NEXT: middle.block: +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]> +; IF-EVL-OUTLOOP-NEXT: EMIT branch-on-cond ir +; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP-NEXT: ir-bb: +; IF-EVL-OUTLOOP-NEXT: No successors +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP-NEXT: scalar.ph: +; IF-EVL-OUTLOOP-NEXT: No successors +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; IF-EVL-OUTLOOP-NEXT: } +; ; IF-EVL-INLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { ; IF-EVL-INLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF From 41823f08cb88c7eb048566b6e68b91ee6ce15439 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 25 Jun 2024 21:04:22 -0700 Subject: [PATCH 02/35] VPlan Pattern match for Instruction::Select --- .../Transforms/Vectorize/VPlanPatternMatch.h | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index b8d69f4cd394d..70365d0aba8b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -55,6 +55,17 @@ template struct bind_ty { } }; +/// Match a specified VPValue. +struct specificval_ty { + const VPValue *Val; + + specificval_ty(const VPValue *V) : Val(V) {} //std::move? + + bool match(VPValue *VPV) { return VPV == Val; } +}; + +inline specificval_ty m_Specific(const VPValue *VPV) { return VPV; } + /// Match a specified integer value or vector of all elements of that /// value. \p BitWidth optionally specifies the bitwidth the matched constant /// must have. If it is 0, the matched constant can have any bitwidth. @@ -224,6 +235,16 @@ using AllBinaryRecipe_match = BinaryRecipe_match; +template +using TernaryRecipe_match = Recipe_match, + Opcode, Commutative, RecipeTys...>; + +template +using TernaryVPInstruction_match = + TernaryRecipe_match; + template inline UnaryVPInstruction_match m_VPInstruction(const Op0_t &Op0) { @@ -236,6 +257,12 @@ m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1) { return BinaryVPInstruction_match(Op0, Op1); } +template +inline TernaryVPInstruction_match +m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { + return TernaryVPInstruction_match(Op0, Op1, Op2); +} + template inline UnaryVPInstruction_match m_Not(const Op0_t &Op0) { @@ -371,6 +398,14 @@ inline VPScalarIVSteps_match m_ScalarIVSteps(const Op0_t &Op0, const Op1_t &Op1) { return VPScalarIVSteps_match(Op0, Op1); } + +template +inline TernaryVPInstruction_match +m_Select(const Op0_t &Cond, const Op1_t &LHS, const Op2_t &RHS) { + return m_VPInstruction(Cond, LHS, + RHS); +} + } // namespace VPlanPatternMatch } // namespace llvm From d4307d0c16e41e6422d41379b8accea2005c56fb Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 10 Jul 2024 01:50:05 -0700 Subject: [PATCH 03/35] Init implement TU select --- llvm/lib/Transforms/Vectorize/VPlan.h | 3 +++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index ed0ea98f35a9a..0ac87073f0bf9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1230,6 +1230,9 @@ class VPInstruction : public VPRecipeWithIRFlags, // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). PtrAdd, + // Selects elements from two vectors (second and third operand) based on a + // condition vector (first operand) and a pivot index (fourth operand). + MergeUntilPivot, }; private: diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 6254ea1519181..bccdc51f26d14 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -64,6 +64,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::PtrAdd: + case VPInstruction::MergeUntilPivot: return false; default: return true; @@ -636,7 +637,18 @@ Value *VPInstruction::generate(VPTransformState &State) { } return NewPhi; } + case VPInstruction::MergeUntilPivot: { + assert(Part == 0 && "No unrolling expected for predicated vectorization."); + Value *Cond = State.get(getOperand(0), Part); + Value *OnTrue = State.get(getOperand(1), Part); + Value *OnFalse = State.get(getOperand(2), Part); + Value *Pivot = State.get(getOperand(3), VPIteration(0, 0)); + assert(Pivot->getType()->isIntegerTy() && "Pivot should be an integer."); + return Builder.CreateIntrinsic(Intrinsic::vp_merge, {OnTrue->getType()}, + {Cond, OnTrue, OnFalse, Pivot}, nullptr, + Name); + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -718,6 +730,9 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::BranchOnCond: case VPInstruction::ResumePhi: return true; + case VPInstruction::MergeUntilPivot: + // Pivot must be an integer. + return Op == getOperand(3); }; llvm_unreachable("switch should return"); } @@ -736,6 +751,7 @@ bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { case VPInstruction::BranchOnCount: case VPInstruction::BranchOnCond: case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::MergeUntilPivot: return true; }; llvm_unreachable("switch should return"); @@ -802,6 +818,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::PtrAdd: O << "ptradd"; break; + case VPInstruction::MergeUntilPivot: + O << "merge-until-pivot"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } From 453bc3b37200856ff2bdd758d4674be320d8d833 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 10 Jul 2024 07:13:34 -0700 Subject: [PATCH 04/35] Transform select(HeaderMask, LHS, RHS) --- .../Transforms/Vectorize/VPlanTransforms.cpp | 13 +- .../LoopVectorize/RISCV/inloop-reduction.ll | 37 +- ...rize-force-tail-with-evl-cond-reduction.ll | 13 +- ...ze-force-tail-with-evl-inloop-reduction.ll | 82 +-- ...-force-tail-with-evl-intermediate-store.ll | 11 +- ...vectorize-force-tail-with-evl-reduction.ll | 582 +++++++----------- .../RISCV/vplan-vp-intrinsics-reduction.ll | 5 +- 7 files changed, 290 insertions(+), 453 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 21721dbce1e7c..ae10ebc8132ce 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1442,6 +1442,7 @@ void VPlanTransforms::addActiveLaneMask( /// Replace recipes with their EVL variants. static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { + using namespace llvm::VPlanPatternMatch; SmallVector HeaderMasks = collectAllHeaderMasks(Plan); VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { @@ -1480,7 +1481,17 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { TypeInfo.inferScalarType(Sel), Sel->getDebugLoc()); }) - + .Case([&](VPInstruction *VPI) { + VPValue *LHS, *RHS; + if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), + m_VPValue(RHS)))) + return nullptr; + VPValue *Cond = Plan.getOrAddLiveIn(ConstantInt::getTrue( + CanonicalIVPHI->getScalarType()->getContext())); + return new VPInstruction(VPInstruction::MergeUntilPivot, + {Cond, LHS, RHS, EVL}, + VPI->getDebugLoc()); + }) .Default([&](VPRecipeBase *R) { return nullptr; }); if (!NewRecipe) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll index c6305e21920fe..9105721b7042a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -130,54 +130,45 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP2]] ; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] ; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1 ; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL-OUTLOOP: vector.body: ; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 4, i1 true) ; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = add i32 [[EVL_BASED_IV]], 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[EVL_BASED_IV]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = call @llvm.experimental.stepvector.nxv4i32() -; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = add zeroinitializer, [[TMP8]] -; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP9]] -; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]] -; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 -; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = sext [[VP_OP_LOAD]] to -; IF-EVL-OUTLOOP-NEXT: [[TMP14]] = add [[VEC_PHI]], [[TMP13]] -; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = select [[TMP10]], [[TMP14]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]] +; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sext [[VP_OP_LOAD]] to +; IF-EVL-OUTLOOP-NEXT: [[TMP11]] = add [[VEC_PHI]], [[TMP10]] +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP11]], [[VEC_PHI]], i32 [[TMP6]]) ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]] -; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: -; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP15]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP12]]) ; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; IF-EVL-OUTLOOP: scalar.ph: ; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-OUTLOOP: for.body: ; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; IF-EVL-OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] ; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]] -; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32 +; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP15]] to i32 ; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]] ; IF-EVL-OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-OUTLOOP: for.cond.cleanup.loopexit: -; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]] ; IF-EVL-OUTLOOP: for.cond.cleanup: ; IF-EVL-OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 6f64cb822a43d..37d1d5944c71c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -37,12 +37,9 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL-OUTLOOP: vector.body: ; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -51,19 +48,13 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV1]] ; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV1]], 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV1]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] -; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] -; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] ; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] ; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) ; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = select [[TMP18]], [[VP_OP_LOAD]], zeroinitializer ; IF-EVL-OUTLOOP-NEXT: [[TMP20]] = add [[TMP19]], [[VEC_PHI]] -; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = select [[TMP15]], [[TMP20]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP20]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT1]] = add i64 [[TMP22]], [[EVL_BASED_IV1]] ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] @@ -315,7 +306,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = xor [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = select [[TMP15]], [[TMP20]], zeroinitializer ; IF-EVL-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP21]], [[VEC_PHI]], [[TMP19]] -; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = select [[TMP15]], [[PREDPHI]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[PREDPHI]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll index d60aeaf16cd12..9bf04671ee9fd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll @@ -1790,40 +1790,31 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP18]] = or [[VEC_PHI]], [[TMP17]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP15]] = or [[VEC_PHI]], [[TMP14]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP19]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = freeze i1 [[TMP22]] -; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP23]], i32 [[INV:%.*]], i32 [[START:%.*]] +; IF-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP16]]) +; IF-EVL-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]] +; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 [[INV:%.*]], i32 [[START:%.*]] ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1833,8 +1824,8 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP24]], 3 +; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP21]], 3 ; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] @@ -1926,40 +1917,31 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = fcmp fast olt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP18]] = or [[VEC_PHI]], [[TMP17]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast olt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP15]] = or [[VEC_PHI]], [[TMP14]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP19]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = freeze i1 [[TMP22]] -; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP23]], i32 [[INV:%.*]], i32 [[START:%.*]] +; IF-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP16]]) +; IF-EVL-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]] +; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 [[INV:%.*]], i32 [[START:%.*]] ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1969,8 +1951,8 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP24]], 3.000000e+00 +; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP21]], 3.000000e+00 ; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll index d6675d139b4e2..b3bb5d1949370 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll @@ -45,12 +45,9 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP7]] ; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] ; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-OUTLOOP: vector.body: ; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -59,17 +56,11 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) ; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = add zeroinitializer, [[TMP14]] -; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP15]] -; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT3]] ; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] ; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]), !alias.scope [[META0:![0-9]+]] ; IF-EVL-OUTLOOP-NEXT: [[TMP19]] = add [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = select [[TMP16]], [[TMP19]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP19]], [[VEC_PHI]], i32 [[TMP12]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP9]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll index 405a4e2f14be2..69f3315403311 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll @@ -24,54 +24,45 @@ define i32 @add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START:%.*]], i32 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] -; IF-EVL-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP18]] = add [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP15]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15]] = add [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP23]], [[RDX]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP20]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[ADD_LCSSA]] ; ; NO-VP-LABEL: @add( @@ -234,54 +225,45 @@ define i32 @or(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START:%.*]], i32 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] -; IF-EVL-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP18]] = or [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP15]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15]] = or [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32( [[TMP16]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[OR]] = or i32 [[TMP23]], [[RDX]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[OR]] = or i32 [[TMP20]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[OR_LCSSA]] ; ; NO-VP-LABEL: @or( @@ -363,54 +345,45 @@ define i32 @and(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement shufflevector ( insertelement ( poison, i32 -1, i64 0), poison, zeroinitializer), i32 [[START:%.*]], i32 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] -; IF-EVL-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP18]] = and [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP15]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15]] = and [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32( [[TMP16]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[AND]] = and i32 [[TMP23]], [[RDX]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[AND]] = and i32 [[TMP20]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[AND_LCSSA]] ; ; NO-VP-LABEL: @and( @@ -492,54 +465,45 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START:%.*]], i32 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] -; IF-EVL-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP18]] = xor [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP15]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15]] = xor [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP16]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP23]], [[RDX]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP20]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[XOR_LCSSA]] ; ; NO-VP-LABEL: @xor( @@ -621,57 +585,48 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 ; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = icmp slt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP18]] = select [[TMP17]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP16]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP23]], [[RDX]] -; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP23]], i32 [[RDX]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP20]], [[RDX]] +; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP20]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[SMIN_LCSSA]] ; ; NO-VP-LABEL: @smin( @@ -757,57 +712,48 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 ; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = icmp sgt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP18]] = select [[TMP17]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp sgt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32( [[TMP16]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP23]], [[RDX]] -; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP23]], i32 [[RDX]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP20]], [[RDX]] +; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP20]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[SMAX_LCSSA]] ; ; NO-VP-LABEL: @smax( @@ -893,57 +839,48 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 ; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ult [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP18]] = select [[TMP17]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ult [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32( [[TMP16]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP23]], [[RDX]] -; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP23]], i32 [[RDX]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP20]], [[RDX]] +; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP20]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[UMIN_LCSSA]] ; ; NO-VP-LABEL: @umin( @@ -1029,57 +966,48 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 ; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ugt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP18]] = select [[TMP17]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ugt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32( [[TMP19]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32( [[TMP16]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP23]], [[RDX]] -; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP23]], i32 [[RDX]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP20]], [[RDX]] +; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP20]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[UMAX_LCSSA]] ; ; NO-VP-LABEL: @umax( @@ -1165,54 +1093,45 @@ define float @fadd(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[START:%.*]], i32 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] -; IF-EVL-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP18]] = fadd reassoc [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select reassoc [[TMP15]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15]] = fadd reassoc [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP19]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP16]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP23]], [[RDX]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP20]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[ADD_LCSSA]] ; ; NO-VP-LABEL: @fadd( @@ -1375,57 +1294,48 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, float [[START:%.*]], i64 0 ; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = fcmp fast olt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP18]] = select [[TMP17]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select fast [[TMP14]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast olt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32( [[TMP19]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32( [[TMP16]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP23]], [[RDX]] -; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP23]], float [[RDX]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP20]], [[RDX]] +; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP20]], float [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MIN_LCSSA]] ; ; NO-VP-LABEL: @fmin( @@ -1511,57 +1421,48 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, float [[START:%.*]], i64 0 ; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = fcmp fast ogt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP18]] = select [[TMP17]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select fast [[TMP14]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast ogt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32( [[TMP19]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32( [[TMP16]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP23]], [[RDX]] -; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP23]], float [[RDX]] +; IF-EVL-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP20]], [[RDX]] +; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP20]], float [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MAX_LCSSA]] ; ; NO-VP-LABEL: @fmax( @@ -1809,59 +1710,50 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[START:%.*]], i32 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] -; IF-EVL-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP20]] = call reassoc @llvm.fmuladd.nxv4f32( [[VP_OP_LOAD]], [[VP_OP_LOAD3]], [[VEC_PHI]]) -; IF-EVL-NEXT: [[TMP21:%.*]] = select reassoc [[TMP15]], [[TMP20]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP22]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP12]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP17]] = call reassoc @llvm.fmuladd.nxv4f32( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], [[VEC_PHI]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP17]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP24:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP21]]) +; IF-EVL-NEXT: [[TMP21:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP18]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP25]], float [[TMP26]], float [[RDX]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP22]], float [[TMP23]], float [[RDX]]) ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MULADD_LCSSA]] ; ; NO-VP-LABEL: @fmuladd( @@ -1950,40 +1842,31 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP18]] = or [[VEC_PHI]], [[TMP17]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP15]] = or [[VEC_PHI]], [[TMP14]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP19]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = freeze i1 [[TMP22]] -; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP23]], i32 [[INV:%.*]], i32 [[START:%.*]] +; IF-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP16]]) +; IF-EVL-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]] +; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 [[INV:%.*]], i32 [[START:%.*]] ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1993,8 +1876,8 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP24]], 3 +; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP21]], 3 ; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] @@ -2086,40 +1969,31 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = fcmp fast olt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP18]] = or [[VEC_PHI]], [[TMP17]] -; IF-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP18]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast olt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP15]] = or [[VEC_PHI]], [[TMP14]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP19]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = freeze i1 [[TMP22]] -; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP23]], i32 [[INV:%.*]], i32 [[START:%.*]] +; IF-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP16]]) +; IF-EVL-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]] +; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 [[INV:%.*]], i32 [[START:%.*]] ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -2129,8 +2003,8 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP24]], 3.000000e+00 +; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP21]], 3.000000e+00 ; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 3377ae0a9f74e..05379d11c40a7 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -27,7 +27,6 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { ; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF ; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[BEC:%[0-9]+]]> = backedge-taken count ; IF-EVL-OUTLOOP-NEXT: Live-in ir<%n> = original trip-count ; IF-EVL-OUTLOOP-EMPTY: ; IF-EVL-OUTLOOP: vector.ph: @@ -40,13 +39,11 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%n> ; IF-EVL-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[VEC_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[EVL_PHI]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[ACTIVE:%.+]]> = icmp ule vp<[[VEC_IV]]>, vp<[[BEC]]> ; IF-EVL-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> ; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> ; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD1]]>, ir<[[RDX_PHI]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_SELECT:%.+]]> = select vp<[[ACTIVE]]>, ir<[[ADD]]>, ir<[[RDX_PHI]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_SELECT:%.+]]> = merge-until-pivot ir, ir<[[ADD]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]> ; IF-EVL-OUTLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> From 5a4aba6afc94e71bfa1abc4fcc0b6d83a06a37a5 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 1 Aug 2024 05:21:28 -0700 Subject: [PATCH 05/35] Add vplan test for cond reduction with basic block. --- .../Transforms/Vectorize/VPlanTransforms.cpp | 4 + .../RISCV/vplan-vp-intrinsics-reduction.ll | 204 ++++++++++++++++++ 2 files changed, 208 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ae10ebc8132ce..84f1ce44eab6e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1482,6 +1482,10 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { Sel->getDebugLoc()); }) .Case([&](VPInstruction *VPI) { + // TODO: Transform + // select(HeaderMask, LHS, blend(RHS, LHS/BlendMask)) + // into + // MergeUntilPivot(BlendMask, LHS, RHS, EVL) VPValue *LHS, *RHS; if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 05379d11c40a7..bf5c49fdbfd64 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -225,5 +225,209 @@ for.end: ret i32 %add } +define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { +; IF-EVL-OUTLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { +; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[BTC:%[0-9]+]]> = backedge-taken count +; IF-EVL-OUTLOOP-NEXT: Live-in ir<%n> = original trip-count +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP: vector.ph: +; IF-EVL-OUTLOOP-NEXT: Successor(s): vector loop +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP-NEXT: vector loop: { +; IF-EVL-OUTLOOP-NEXT: vector.body: +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; IF-EVL-OUTLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> +; IF-EVL-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%n> +; IF-EVL-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[WIV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[EVL_PHI]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[HEADER_MASK:%.+]]> = icmp ule vp<[[WIV]]>, vp<[[BTC]]> +; IF-EVL-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; IF-EVL-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> +; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[COND:%.+]]> = icmp sgt ir<[[LD1]]>, ir<3> +; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[RDX_PHI]]>, ir<[[LD1]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[NOT_COND:%.+]]> = not ir<[[COND]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[BLEND_MASK:%.+]]> = logical-and vp<[[HEADER_MASK]]>, vp<[[NOT_COND]]> +; IF-EVL-OUTLOOP-NEXT: BLEND ir<[[BLEND_ADD:%.+]]> = ir<[[ADD]]> ir<[[RDX_PHI]]>/vp<[[BLEND_MASK]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_SELECT:%.+]]> = merge-until-pivot ir, ir<[[BLEND_ADD]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]> +; IF-EVL-OUTLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-OUTLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> +; IF-EVL-OUTLOOP-NEXT: No successors +; IF-EVL-OUTLOOP-NEXT: } +; IF-EVL-OUTLOOP-NEXT: Successor(s): middle.block +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP-NEXT: middle.block: +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]> +; IF-EVL-OUTLOOP-NEXT: EMIT branch-on-cond ir +; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP-NEXT: ir-bb: +; IF-EVL-OUTLOOP-NEXT: No successors +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP-NEXT: scalar.ph: +; IF-EVL-OUTLOOP-NEXT: No successors +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP-NEXT: Live-out i32 %rdx.add.lcssa = vp<[[RDX]]> +; IF-EVL-OUTLOOP-NEXT: } +; + +; IF-EVL-INLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { +; IF-EVL-INLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; IF-EVL-INLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-INLOOP-NEXT: Live-in vp<[[BTC:%[0-9]+]]> = backedge-taken count +; IF-EVL-INLOOP-NEXT: Live-in ir<%n> = original trip-count +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP: vector.ph: +; IF-EVL-INLOOP-NEXT: Successor(s): vector loop +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: vector loop: { +; IF-EVL-INLOOP-NEXT: vector.body: +; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; IF-EVL-INLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> +; IF-EVL-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> +; IF-EVL-INLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%n> +; IF-EVL-INLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> +; IF-EVL-INLOOP-NEXT: EMIT vp<[[WIV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[EVL_PHI]]> +; IF-EVL-INLOOP-NEXT: EMIT vp<[[HEADER_MASK:%.+]]> = icmp ule vp<[[WIV]]>, vp<[[BTC]]> +; IF-EVL-INLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; IF-EVL-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; IF-EVL-INLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> +; IF-EVL-INLOOP-NEXT: WIDEN ir<[[COND:%.+]]> = icmp sgt ir<[[LD1]]>, ir<3> +; IF-EVL-INLOOP-NEXT: EMIT vp<[[MASK:%.+]]> = logical-and vp<[[HEADER_MASK]]>, ir<[[COND]]> +; IF-EVL-INLOOP-NEXT: REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + vp.reduce.add (ir<[[LD1]]>, vp<[[EVL]]>, vp<[[MASK]]>) +; IF-EVL-INLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 +; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> +; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-INLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> +; IF-EVL-INLOOP-NEXT: No successors +; IF-EVL-INLOOP-NEXT: } +; IF-EVL-INLOOP-NEXT: Successor(s): middle.block +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: middle.block: +; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; IF-EVL-INLOOP-NEXT: EMIT branch-on-cond ir +; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: ir-bb: +; IF-EVL-INLOOP-NEXT: No successors +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: scalar.ph: +; IF-EVL-INLOOP-NEXT: No successors +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: Live-out i32 %rdx.add.lcssa = vp<[[RDX]]> +; IF-EVL-INLOOP-NEXT: } +; + +; NO-VP-OUTLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; NO-VP-OUTLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; NO-VP-OUTLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; NO-VP-OUTLOOP-NEXT: Live-in ir<%n> = original trip-count +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP: vector.ph: +; NO-VP-OUTLOOP-NEXT: Successor(s): vector loop +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: vector loop: { +; NO-VP-OUTLOOP-NEXT: vector.body: +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; NO-VP-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> +; NO-VP-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; NO-VP-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; NO-VP-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]> +; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[COND:%.+]]> = icmp sgt ir<[[LD1]]>, ir<3> +; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[RDX_PHI]]>, ir<[[LD1]]> +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[NOT_COND:%.+]]> = not ir<[[COND]]> +; NO-VP-OUTLOOP-NEXT: BLEND ir<[[BLEND_ADD:%.+]]> = ir<[[ADD]]> ir<[[RDX_PHI]]>/vp<[[NOT_COND]]> +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-OUTLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> +; NO-VP-OUTLOOP-NEXT: No successors +; NO-VP-OUTLOOP-NEXT: } +; NO-VP-OUTLOOP-NEXT: Successor(s): middle.block +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: middle.block: +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[BLEND_ADD]]> +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> +; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]> +; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: ir-bb: +; NO-VP-OUTLOOP-NEXT: No successors +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: scalar.ph: +; NO-VP-OUTLOOP-NEXT: No successors +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: Live-out i32 %rdx.add.lcssa = vp<[[RDX]]> +; NO-VP-OUTLOOP-NEXT: } +; + +; NO-VP-INLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; NO-VP-INLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; NO-VP-INLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; NO-VP-INLOOP-NEXT: Live-in ir<%n> = original trip-count +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP: vector.ph: +; NO-VP-INLOOP-NEXT: Successor(s): vector loop +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: vector loop: { +; NO-VP-INLOOP-NEXT: vector.body: +; NO-VP-INLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; NO-VP-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> +; NO-VP-INLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; NO-VP-INLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; NO-VP-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; NO-VP-INLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]> +; NO-VP-INLOOP-NEXT: WIDEN ir<[[COND:%.+]]> = icmp sgt ir<[[LD1]]>, ir<3> +; NO-VP-INLOOP-NEXT: REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + reduce.add (ir<[[LD1]]>, ir<[[COND]]>) +; NO-VP-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-INLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> +; NO-VP-INLOOP-NEXT: No successors +; NO-VP-INLOOP-NEXT: } +; NO-VP-INLOOP-NEXT: Successor(s): middle.block +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: middle.block: +; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; NO-VP-INLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> +; NO-VP-INLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]> +; NO-VP-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: ir-bb: +; NO-VP-INLOOP-NEXT: No successors +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: scalar.ph: +; NO-VP-INLOOP-NEXT: No successors +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: Live-out i32 %rdx.add.lcssa = vp<[[RDX]]> +; NO-VP-INLOOP-NEXT: } +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %rdx = phi i32 [ %start, %entry ], [ %rdx.add, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp = icmp sgt i32 %0, 3 + br i1 %cmp, label %if.then, label %for.inc + +if.then: + %add.pred = add nsw i32 %rdx, %0 + br label %for.inc + +for.inc: + %rdx.add = phi i32 [ %add.pred, %if.then ], [ %rdx, %for.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %rdx.add +} + !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.vectorize.enable", i1 true} From 7b21944994837cefcc9e4a815ddef864723b9566 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 1 Aug 2024 18:42:05 -0700 Subject: [PATCH 06/35] Revert "Add vplan test for cond reduction with basic block." This reverts commit fce239f21979fd4f70f72f51466a277f8d87bc5a. --- .../Transforms/Vectorize/VPlanTransforms.cpp | 4 - .../RISCV/vplan-vp-intrinsics-reduction.ll | 204 ------------------ 2 files changed, 208 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 84f1ce44eab6e..ae10ebc8132ce 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1482,10 +1482,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { Sel->getDebugLoc()); }) .Case([&](VPInstruction *VPI) { - // TODO: Transform - // select(HeaderMask, LHS, blend(RHS, LHS/BlendMask)) - // into - // MergeUntilPivot(BlendMask, LHS, RHS, EVL) VPValue *LHS, *RHS; if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index bf5c49fdbfd64..05379d11c40a7 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -225,209 +225,5 @@ for.end: ret i32 %add } -define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { -; IF-EVL-OUTLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[BTC:%[0-9]+]]> = backedge-taken count -; IF-EVL-OUTLOOP-NEXT: Live-in ir<%n> = original trip-count -; IF-EVL-OUTLOOP-EMPTY: -; IF-EVL-OUTLOOP: vector.ph: -; IF-EVL-OUTLOOP-NEXT: Successor(s): vector loop -; IF-EVL-OUTLOOP-EMPTY: -; IF-EVL-OUTLOOP-NEXT: vector loop: { -; IF-EVL-OUTLOOP-NEXT: vector.body: -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-OUTLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> -; IF-EVL-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%n> -; IF-EVL-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[WIV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[EVL_PHI]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[HEADER_MASK:%.+]]> = icmp ule vp<[[WIV]]>, vp<[[BTC]]> -; IF-EVL-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[COND:%.+]]> = icmp sgt ir<[[LD1]]>, ir<3> -; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[RDX_PHI]]>, ir<[[LD1]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[NOT_COND:%.+]]> = not ir<[[COND]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[BLEND_MASK:%.+]]> = logical-and vp<[[HEADER_MASK]]>, vp<[[NOT_COND]]> -; IF-EVL-OUTLOOP-NEXT: BLEND ir<[[BLEND_ADD:%.+]]> = ir<[[ADD]]> ir<[[RDX_PHI]]>/vp<[[BLEND_MASK]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_SELECT:%.+]]> = merge-until-pivot ir, ir<[[BLEND_ADD]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]> -; IF-EVL-OUTLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-OUTLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-OUTLOOP-NEXT: No successors -; IF-EVL-OUTLOOP-NEXT: } -; IF-EVL-OUTLOOP-NEXT: Successor(s): middle.block -; IF-EVL-OUTLOOP-EMPTY: -; IF-EVL-OUTLOOP-NEXT: middle.block: -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]> -; IF-EVL-OUTLOOP-NEXT: EMIT branch-on-cond ir -; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph -; IF-EVL-OUTLOOP-EMPTY: -; IF-EVL-OUTLOOP-NEXT: ir-bb: -; IF-EVL-OUTLOOP-NEXT: No successors -; IF-EVL-OUTLOOP-EMPTY: -; IF-EVL-OUTLOOP-NEXT: scalar.ph: -; IF-EVL-OUTLOOP-NEXT: No successors -; IF-EVL-OUTLOOP-EMPTY: -; IF-EVL-OUTLOOP-NEXT: Live-out i32 %rdx.add.lcssa = vp<[[RDX]]> -; IF-EVL-OUTLOOP-NEXT: } -; - -; IF-EVL-INLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-INLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-INLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-INLOOP-NEXT: Live-in vp<[[BTC:%[0-9]+]]> = backedge-taken count -; IF-EVL-INLOOP-NEXT: Live-in ir<%n> = original trip-count -; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP: vector.ph: -; IF-EVL-INLOOP-NEXT: Successor(s): vector loop -; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP-NEXT: vector loop: { -; IF-EVL-INLOOP-NEXT: vector.body: -; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-INLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> -; IF-EVL-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> -; IF-EVL-INLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%n> -; IF-EVL-INLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> -; IF-EVL-INLOOP-NEXT: EMIT vp<[[WIV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[EVL_PHI]]> -; IF-EVL-INLOOP-NEXT: EMIT vp<[[HEADER_MASK:%.+]]> = icmp ule vp<[[WIV]]>, vp<[[BTC]]> -; IF-EVL-INLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-INLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-INLOOP-NEXT: WIDEN ir<[[COND:%.+]]> = icmp sgt ir<[[LD1]]>, ir<3> -; IF-EVL-INLOOP-NEXT: EMIT vp<[[MASK:%.+]]> = logical-and vp<[[HEADER_MASK]]>, ir<[[COND]]> -; IF-EVL-INLOOP-NEXT: REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + vp.reduce.add (ir<[[LD1]]>, vp<[[EVL]]>, vp<[[MASK]]>) -; IF-EVL-INLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-INLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-INLOOP-NEXT: No successors -; IF-EVL-INLOOP-NEXT: } -; IF-EVL-INLOOP-NEXT: Successor(s): middle.block -; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP-NEXT: middle.block: -; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> -; IF-EVL-INLOOP-NEXT: EMIT branch-on-cond ir -; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph -; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP-NEXT: ir-bb: -; IF-EVL-INLOOP-NEXT: No successors -; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP-NEXT: scalar.ph: -; IF-EVL-INLOOP-NEXT: No successors -; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP-NEXT: Live-out i32 %rdx.add.lcssa = vp<[[RDX]]> -; IF-EVL-INLOOP-NEXT: } -; - -; NO-VP-OUTLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { -; NO-VP-OUTLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; NO-VP-OUTLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; NO-VP-OUTLOOP-NEXT: Live-in ir<%n> = original trip-count -; NO-VP-OUTLOOP-EMPTY: -; NO-VP-OUTLOOP: vector.ph: -; NO-VP-OUTLOOP-NEXT: Successor(s): vector loop -; NO-VP-OUTLOOP-EMPTY: -; NO-VP-OUTLOOP-NEXT: vector loop: { -; NO-VP-OUTLOOP-NEXT: vector.body: -; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; NO-VP-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> -; NO-VP-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> -; NO-VP-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; NO-VP-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]> -; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[COND:%.+]]> = icmp sgt ir<[[LD1]]>, ir<3> -; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[RDX_PHI]]>, ir<[[LD1]]> -; NO-VP-OUTLOOP-NEXT: EMIT vp<[[NOT_COND:%.+]]> = not ir<[[COND]]> -; NO-VP-OUTLOOP-NEXT: BLEND ir<[[BLEND_ADD:%.+]]> = ir<[[ADD]]> ir<[[RDX_PHI]]>/vp<[[NOT_COND]]> -; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> -; NO-VP-OUTLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; NO-VP-OUTLOOP-NEXT: No successors -; NO-VP-OUTLOOP-NEXT: } -; NO-VP-OUTLOOP-NEXT: Successor(s): middle.block -; NO-VP-OUTLOOP-EMPTY: -; NO-VP-OUTLOOP-NEXT: middle.block: -; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[BLEND_ADD]]> -; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> -; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]> -; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph -; NO-VP-OUTLOOP-EMPTY: -; NO-VP-OUTLOOP-NEXT: ir-bb: -; NO-VP-OUTLOOP-NEXT: No successors -; NO-VP-OUTLOOP-EMPTY: -; NO-VP-OUTLOOP-NEXT: scalar.ph: -; NO-VP-OUTLOOP-NEXT: No successors -; NO-VP-OUTLOOP-EMPTY: -; NO-VP-OUTLOOP-NEXT: Live-out i32 %rdx.add.lcssa = vp<[[RDX]]> -; NO-VP-OUTLOOP-NEXT: } -; - -; NO-VP-INLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { -; NO-VP-INLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; NO-VP-INLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; NO-VP-INLOOP-NEXT: Live-in ir<%n> = original trip-count -; NO-VP-INLOOP-EMPTY: -; NO-VP-INLOOP: vector.ph: -; NO-VP-INLOOP-NEXT: Successor(s): vector loop -; NO-VP-INLOOP-EMPTY: -; NO-VP-INLOOP-NEXT: vector loop: { -; NO-VP-INLOOP-NEXT: vector.body: -; NO-VP-INLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; NO-VP-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> -; NO-VP-INLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> -; NO-VP-INLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; NO-VP-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; NO-VP-INLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]> -; NO-VP-INLOOP-NEXT: WIDEN ir<[[COND:%.+]]> = icmp sgt ir<[[LD1]]>, ir<3> -; NO-VP-INLOOP-NEXT: REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + reduce.add (ir<[[LD1]]>, ir<[[COND]]>) -; NO-VP-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> -; NO-VP-INLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; NO-VP-INLOOP-NEXT: No successors -; NO-VP-INLOOP-NEXT: } -; NO-VP-INLOOP-NEXT: Successor(s): middle.block -; NO-VP-INLOOP-EMPTY: -; NO-VP-INLOOP-NEXT: middle.block: -; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> -; NO-VP-INLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> -; NO-VP-INLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]> -; NO-VP-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph -; NO-VP-INLOOP-EMPTY: -; NO-VP-INLOOP-NEXT: ir-bb: -; NO-VP-INLOOP-NEXT: No successors -; NO-VP-INLOOP-EMPTY: -; NO-VP-INLOOP-NEXT: scalar.ph: -; NO-VP-INLOOP-NEXT: No successors -; NO-VP-INLOOP-EMPTY: -; NO-VP-INLOOP-NEXT: Live-out i32 %rdx.add.lcssa = vp<[[RDX]]> -; NO-VP-INLOOP-NEXT: } -; -entry: - br label %for.body - -for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] - %rdx = phi i32 [ %start, %entry ], [ %rdx.add, %for.inc ] - %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv - %0 = load i32, ptr %arrayidx, align 4 - %cmp = icmp sgt i32 %0, 3 - br i1 %cmp, label %if.then, label %for.inc - -if.then: - %add.pred = add nsw i32 %rdx, %0 - br label %for.inc - -for.inc: - %rdx.add = phi i32 [ %add.pred, %if.then ], [ %rdx, %for.body ] - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %n - br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 - -for.end: - ret i32 %rdx.add -} - !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.vectorize.enable", i1 true} From 8660beb076ad218f6f252d469318ea6428bee611 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 1 Aug 2024 18:59:06 -0700 Subject: [PATCH 07/35] Sort code and comments. --- llvm/lib/Transforms/Vectorize/VPlan.h | 4 +++- llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h | 2 +- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 - 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0ac87073f0bf9..97146c1f0c12b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1231,7 +1231,9 @@ class VPInstruction : public VPRecipeWithIRFlags, // for all lanes, depending on its uses). PtrAdd, // Selects elements from two vectors (second and third operand) based on a - // condition vector (first operand) and a pivot index (fourth operand). + // condition vector (first operand) and a pivot index (fourth operand). The + // lanes whose positions are greater than or equal to the pivot are taken + // from the third operand. MergeUntilPivot, }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 70365d0aba8b6..29b220d29d93b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -59,7 +59,7 @@ template struct bind_ty { struct specificval_ty { const VPValue *Val; - specificval_ty(const VPValue *V) : Val(V) {} //std::move? + specificval_ty(const VPValue *V) : Val(V) {} bool match(VPValue *VPV) { return VPV == Val; } }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index bccdc51f26d14..75d19320e9224 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -644,7 +644,6 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *OnFalse = State.get(getOperand(2), Part); Value *Pivot = State.get(getOperand(3), VPIteration(0, 0)); assert(Pivot->getType()->isIntegerTy() && "Pivot should be an integer."); - return Builder.CreateIntrinsic(Intrinsic::vp_merge, {OnTrue->getType()}, {Cond, OnTrue, OnFalse, Pivot}, nullptr, Name); From fd5a5de4539e4e4f9e8a85ddb056089e2fe71abd Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 12 Aug 2024 02:56:59 -0700 Subject: [PATCH 08/35] Force to predicate a reduction operation when fold by EVL. --- .../Transforms/Vectorize/LoopVectorize.cpp | 17 +++- .../LoopVectorize/RISCV/inloop-reduction.ll | 65 +++++++++++--- ...rize-force-tail-with-evl-cond-reduction.ll | 12 +-- ...ze-force-tail-with-evl-inloop-reduction.ll | 12 +-- ...-force-tail-with-evl-intermediate-store.ll | 6 +- ...vectorize-force-tail-with-evl-reduction.ll | 84 +++++++++---------- .../RISCV/vplan-vp-intrinsics-reduction.ll | 4 +- 7 files changed, 125 insertions(+), 75 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 35113c204b3c1..4eed377a51e5e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1484,6 +1484,17 @@ class LoopVectorizationCostModel { return InLoopReductions.contains(Phi); } + /// Returns true if the predicated reduction select should be used. + bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const { + // Force to use predicated reduction select since the EVL of the + // second-to-last iteration might not be VF*UF. + if (foldTailWithEVL()) + return true; + return PreferPredicatedReductionSelect || + TTI.preferPredicatedReductionSelect( + Opcode, PhiTy, TargetTransformInfo::ReductionFlags()); + } + /// Estimate cost of an intrinsic call instruction CI if it were vectorized /// with factor VF. Return the cost of the instruction, including /// scalarization overhead if it's needed. @@ -9453,10 +9464,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( cast(&U)->getOpcode() == VPInstruction::ComputeReductionResult; }); - if (PreferPredicatedReductionSelect || - TTI.preferPredicatedReductionSelect( - PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy, - TargetTransformInfo::ReductionFlags())) + if (CM.usePredicatedReductionSelect( + PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy)) PhiR->setOperand(1, NewExitingVPV); } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll index 9105721b7042a..17ee137e1d8e4 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -136,7 +136,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; IF-EVL-OUTLOOP: vector.body: ; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 4, i1 true) ; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = add i32 [[EVL_BASED_IV]], 0 @@ -144,8 +144,8 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sext [[VP_OP_LOAD]] to -; IF-EVL-OUTLOOP-NEXT: [[TMP11]] = add [[VEC_PHI]], [[TMP10]] -; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP11]], [[VEC_PHI]], i32 [[TMP6]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = add [[VEC_PHI]], [[TMP10]] +; IF-EVL-OUTLOOP-NEXT: [[TMP12]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP11]], [[VEC_PHI]], i32 [[TMP6]]) ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]] ; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -350,19 +350,60 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; ; IF-EVL-OUTLOOP-LABEL: @smin( ; IF-EVL-OUTLOOP-NEXT: entry: +; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL-OUTLOOP: vector.ph: +; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-OUTLOOP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-OUTLOOP: vector.body: +; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-OUTLOOP: middle.block: +; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP16]]) +; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-OUTLOOP: scalar.ph: +; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-OUTLOOP: for.body: -; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-OUTLOOP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP0]], [[RDX]] -; IF-EVL-OUTLOOP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]] +; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-OUTLOOP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP20]], [[RDX]] +; IF-EVL-OUTLOOP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP20]], i32 [[RDX]] ; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL-OUTLOOP: for.end: -; IF-EVL-OUTLOOP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: ret i32 [[SMIN_LCSSA]] ; ; IF-EVL-INLOOP-LABEL: @smin( diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 37d1d5944c71c..79eaf42c22e96 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -53,15 +53,15 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) ; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = select [[TMP18]], [[VP_OP_LOAD]], zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[TMP20]] = add [[TMP19]], [[VEC_PHI]] -; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP20]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = add [[TMP19]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP20]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP21]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT1]] = add i64 [[TMP22]], [[EVL_BASED_IV1]] ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N_VEC]] ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: -; IF-EVL-OUTLOOP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP21]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP20]]) ; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL-OUTLOOP: scalar.ph: ; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -305,15 +305,15 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = add [[VEC_PHI]], [[VP_OP_LOAD]] ; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = xor [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = select [[TMP15]], [[TMP20]], zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP21]], [[VEC_PHI]], [[TMP19]] -; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[PREDPHI]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-OUTLOOP-NEXT: [[PREDPHI1:%.*]] = select [[TMP21]], [[VEC_PHI]], [[TMP19]] +; IF-EVL-OUTLOOP-NEXT: [[PREDPHI]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[PREDPHI1]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] ; IF-EVL-OUTLOOP-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: -; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP22]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PREDPHI]]) ; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL-OUTLOOP: scalar.ph: ; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll index 9bf04671ee9fd..8e4eb9c62d8f9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll @@ -1796,7 +1796,7 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 @@ -1804,8 +1804,8 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP15]] = or [[VEC_PHI]], [[TMP14]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = or [[VEC_PHI]], [[TMP14]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -1923,7 +1923,7 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 @@ -1931,8 +1931,8 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast olt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP15]] = or [[VEC_PHI]], [[TMP14]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = or [[VEC_PHI]], [[TMP14]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll index b3bb5d1949370..7e764f528dd3e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll @@ -59,15 +59,15 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] ; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]), !alias.scope [[META0:![0-9]+]] -; IF-EVL-OUTLOOP-NEXT: [[TMP19]] = add [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP19]], [[VEC_PHI]], i32 [[TMP12]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = add [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP19]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP16]], [[VEC_PHI]], i32 [[TMP12]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP9]] ; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: -; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP20]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; IF-EVL-OUTLOOP-NEXT: store i32 [[TMP23]], ptr [[ADDR]], align 4 ; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL-OUTLOOP: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll index 69f3315403311..de4631a006535 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll @@ -31,15 +31,15 @@ define i32 @add(ptr %a, i64 %n, i32 %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP15]] = add [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = add [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -232,15 +232,15 @@ define i32 @or(ptr %a, i64 %n, i32 %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP15]] = or [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = or [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -352,15 +352,15 @@ define i32 @and(ptr %a, i64 %n, i32 %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP15]] = and [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = and [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -472,15 +472,15 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP15]] = xor [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = xor [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -593,7 +593,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 @@ -601,8 +601,8 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP15]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -720,7 +720,7 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 @@ -728,8 +728,8 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp sgt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP15]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -847,7 +847,7 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 @@ -855,8 +855,8 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ult [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP15]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -974,7 +974,7 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 @@ -982,8 +982,8 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ugt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP15]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -1100,15 +1100,15 @@ define float @fadd(ptr %a, i64 %n, float %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP12]] ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP15]] = fadd reassoc [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = fadd reassoc [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -1302,7 +1302,7 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 @@ -1310,8 +1310,8 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast olt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP15]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -1429,7 +1429,7 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 @@ -1437,8 +1437,8 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast ogt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP15]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -1717,7 +1717,7 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 @@ -1727,8 +1727,8 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP12]] ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP17]] = call reassoc @llvm.fmuladd.nxv4f32( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], [[VEC_PHI]]) -; IF-EVL-NEXT: [[TMP18:%.*]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP17]], [[VEC_PHI]], i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = call reassoc @llvm.fmuladd.nxv4f32( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], [[VEC_PHI]]) +; IF-EVL-NEXT: [[TMP18]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP17]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -1848,7 +1848,7 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 @@ -1856,8 +1856,8 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP15]] = or [[VEC_PHI]], [[TMP14]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = or [[VEC_PHI]], [[TMP14]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] @@ -1975,7 +1975,7 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 @@ -1983,8 +1983,8 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast olt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP15]] = or [[VEC_PHI]], [[TMP14]] -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = or [[VEC_PHI]], [[TMP14]] +; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 05379d11c40a7..a3d9f00a6820d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -36,14 +36,14 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: vector.body: ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-OUTLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> -; IF-EVL-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> +; IF-EVL-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, vp<[[RDX_SELECT:%.+]]> ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%n> ; IF-EVL-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> ; IF-EVL-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> ; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> ; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD1]]>, ir<[[RDX_PHI]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_SELECT:%.+]]> = merge-until-pivot ir, ir<[[ADD]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_SELECT]]> = merge-until-pivot ir, ir<[[ADD]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]> ; IF-EVL-OUTLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> From 6052fa8f83010b025b013bf3555af707654e1135 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 30 Aug 2024 02:13:59 -0700 Subject: [PATCH 09/35] Rebase and update test case. --- .../RISCV/vectorize-force-tail-with-evl-cond-reduction.ll | 2 +- .../LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 79eaf42c22e96..7c1a217326c3e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -294,7 +294,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 ; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 ; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = call @llvm.stepvector.nxv4i64() ; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] ; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] ; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index a3d9f00a6820d..23788c941190a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -54,6 +54,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-EMPTY: ; IF-EVL-OUTLOOP-NEXT: middle.block: ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_EXTRACT:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> ; IF-EVL-OUTLOOP-NEXT: EMIT branch-on-cond ir ; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph ; IF-EVL-OUTLOOP-EMPTY: @@ -63,7 +64,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: scalar.ph: ; IF-EVL-OUTLOOP-NEXT: No successors ; IF-EVL-OUTLOOP-EMPTY: -; IF-EVL-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; IF-EVL-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EXTRACT]]> ; IF-EVL-OUTLOOP-NEXT: } ; From 32556ddb8231d3739cb5f83b4d755a47844899cf Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 3 Oct 2024 02:13:23 -0700 Subject: [PATCH 10/35] Rebase and updated --- llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h | 7 ++++++- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 ++++----- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 13 +++++++------ 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 29b220d29d93b..838e5479a2e5a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -61,7 +61,7 @@ struct specificval_ty { specificval_ty(const VPValue *V) : Val(V) {} - bool match(VPValue *VPV) { return VPV == Val; } + bool match(VPValue *VPV) const { return VPV == Val; } }; inline specificval_ty m_Specific(const VPValue *VPV) { return VPV; } @@ -178,6 +178,11 @@ struct Recipe_match { static_assert(std::tuple_size::value == 2 && "constructor can only be used for binary matcher"); } + template + Recipe_match(A_t A, B_t B, C_t C) : Ops({A, B, C}) { + static_assert(std::tuple_size::value == 3 && + "constructor can only be used for ternary matcher"); + } bool match(const VPValue *V) const { auto *DefR = V->getDefiningRecipe(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 75d19320e9224..d45298846d963 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -638,11 +638,10 @@ Value *VPInstruction::generate(VPTransformState &State) { return NewPhi; } case VPInstruction::MergeUntilPivot: { - assert(Part == 0 && "No unrolling expected for predicated vectorization."); - Value *Cond = State.get(getOperand(0), Part); - Value *OnTrue = State.get(getOperand(1), Part); - Value *OnFalse = State.get(getOperand(2), Part); - Value *Pivot = State.get(getOperand(3), VPIteration(0, 0)); + Value *Cond = State.get(getOperand(0), 0); + Value *OnTrue = State.get(getOperand(1), 0); + Value *OnFalse = State.get(getOperand(2), 0); + Value *Pivot = State.get(getOperand(3), /* IsScalar */ true); assert(Pivot->getType()->isIntegerTy() && "Pivot should be an integer."); return Builder.CreateIntrinsic(Intrinsic::vp_merge, {OnTrue->getType()}, {Cond, OnTrue, OnFalse, Pivot}, nullptr, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ae10ebc8132ce..d8a7f35a4cb06 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1441,7 +1441,8 @@ void VPlanTransforms::addActiveLaneMask( } /// Replace recipes with their EVL variants. -static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { +static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL, + LLVMContext &Ctx) { using namespace llvm::VPlanPatternMatch; SmallVector HeaderMasks = collectAllHeaderMasks(Plan); VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); @@ -1481,15 +1482,14 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { TypeInfo.inferScalarType(Sel), Sel->getDebugLoc()); }) - .Case([&](VPInstruction *VPI) { + .Case([&](VPInstruction *VPI) -> VPRecipeBase * { VPValue *LHS, *RHS; if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) return nullptr; - VPValue *Cond = Plan.getOrAddLiveIn(ConstantInt::getTrue( - CanonicalIVPHI->getScalarType()->getContext())); + VPValue *Cond = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx)); return new VPInstruction(VPInstruction::MergeUntilPivot, - {Cond, LHS, RHS, EVL}, + {Cond, LHS, RHS, &EVL}, VPI->getDebugLoc()); }) .Default([&](VPRecipeBase *R) { return nullptr; }); @@ -1605,7 +1605,8 @@ bool VPlanTransforms::tryAddExplicitVectorLength( NextEVLIV->insertBefore(CanonicalIVIncrement); EVLPhi->addOperand(NextEVLIV); - transformRecipestoEVLRecipes(Plan, *VPEVL); + LLVMContext &Ctx = CanonicalIVPHI->getScalarType()->getContext(); + transformRecipestoEVLRecipes(Plan, *VPEVL, Ctx); // Replace all uses of VPCanonicalIVPHIRecipe by // VPEVLBasedIVPHIRecipe except for the canonical IV increment. From 5636af897d8f7ea60829744f92025ef1c94321ec Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 3 Oct 2024 06:31:52 -0700 Subject: [PATCH 11/35] Updated VPlanVerifier --- .../Transforms/Vectorize/VPlanVerifier.cpp | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 8bdb313324358..a89fe04329102 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -156,22 +156,27 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { .Case( [&](const VPScalarCastRecipe *S) { return true; }) .Case([&](const VPInstruction *I) { - if (I->getOpcode() != Instruction::Add) { + switch (I->getOpcode()) { + default: errs() - << "EVL is used as an operand in non-VPInstruction::Add\n"; + << "EVL is used in VPInstructions that are neither " + "Instruction::Add nor VPInstruction::MergeUntilPivot\n"; return false; + case Instruction::Add: + if (I->getNumUsers() != 1) { + errs() << "EVL is used in VPInstruction:Add with multiple " + "users\n"; + return false; + } + if (!isa(*I->users().begin())) { + errs() << "Result of VPInstruction::Add with EVL operand is " + "not used by VPEVLBasedIVPHIRecipe\n"; + return false; + } + return true; + case VPInstruction::MergeUntilPivot: + return VerifyEVLUse(*I, 3); } - if (I->getNumUsers() != 1) { - errs() << "EVL is used in VPInstruction:Add with multiple " - "users\n"; - return false; - } - if (!isa(*I->users().begin())) { - errs() << "Result of VPInstruction::Add with EVL operand is " - "not used by VPEVLBasedIVPHIRecipe\n"; - return false; - } - return true; }) .Default([&](const VPUser *U) { errs() << "EVL has unexpected user\n"; From ca9e34aeca1c0ef889956ca2e8e604c8f6146be5 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 3 Oct 2024 06:32:33 -0700 Subject: [PATCH 12/35] Updated test case --- .../LoopVectorize/RISCV/inloop-reduction.ll | 150 ++--- ...rize-force-tail-with-evl-cond-reduction.ll | 4 +- ...-force-tail-with-evl-intermediate-store.ll | 34 +- ...vectorize-force-tail-with-evl-reduction.ll | 564 +++++++++--------- .../RISCV/vplan-vp-intrinsics-reduction.ll | 10 +- 5 files changed, 385 insertions(+), 377 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll index 17ee137e1d8e4..7ae58cb082428 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -136,39 +136,39 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; IF-EVL-OUTLOOP: vector.body: ; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]] -; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 4, i1 true) -; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = add i32 [[EVL_BASED_IV]], 0 -; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]] -; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0 -; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sext [[VP_OP_LOAD]] to -; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = add [[VEC_PHI]], [[TMP10]] -; IF-EVL-OUTLOOP-NEXT: [[TMP12]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP11]], [[VEC_PHI]], i32 [[TMP6]]) -; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true) +; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = add i32 [[EVL_BASED_IV]], 0 +; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP6]] +; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP5]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = sext [[VP_OP_LOAD]] to +; IF-EVL-OUTLOOP-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[VEC_PHI]], [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP5]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP10]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[VP_OP]], [[VEC_PHI]], i32 [[TMP5]]) +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP5]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]] -; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: -; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP12]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP10]]) ; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; IF-EVL-OUTLOOP: scalar.ph: ; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-OUTLOOP: for.body: ; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; IF-EVL-OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] ; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]] -; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP15]] to i32 +; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32 ; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]] ; IF-EVL-OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-OUTLOOP: for.cond.cleanup.loopexit: -; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]] ; IF-EVL-OUTLOOP: for.cond.cleanup: ; IF-EVL-OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -193,38 +193,38 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; IF-EVL-INLOOP: vector.body: ; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]] -; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 8, i1 true) -; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = add i32 [[EVL_BASED_IV]], 0 -; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]] -; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0 -; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = sext [[VP_OP_LOAD]] to -; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, [[TMP10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) -; IF-EVL-INLOOP-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]] -; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 8, i1 true) +; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = add i32 [[EVL_BASED_IV]], 0 +; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP6]] +; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0 +; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP5]]) +; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = sext [[VP_OP_LOAD]] to +; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP5]]) +; IF-EVL-INLOOP-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]] +; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP5]], [[EVL_BASED_IV]] ; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]] -; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL-INLOOP: middle.block: ; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; IF-EVL-INLOOP: scalar.ph: ; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-INLOOP: for.body: ; IF-EVL-INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; IF-EVL-INLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] ; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]] -; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32 ; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]] ; IF-EVL-INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; IF-EVL-INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-INLOOP: for.cond.cleanup.loopexit: -; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP]] ; IF-EVL-INLOOP: for.cond.cleanup: ; IF-EVL-INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -264,12 +264,12 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; OUTLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; OUTLOOP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 -; OUTLOOP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; OUTLOOP: vector.body: ; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; OUTLOOP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 ; OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] ; OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 @@ -364,46 +364,46 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; IF-EVL-OUTLOOP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL-OUTLOOP: vector.body: ; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) -; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-OUTLOOP-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) -; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp slt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: -; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP16]]) +; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP15]]) ; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL-OUTLOOP: scalar.ph: ; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-OUTLOOP: for.body: ; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-OUTLOOP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP20]], [[RDX]] -; IF-EVL-OUTLOOP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP20]], i32 [[RDX]] +; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-OUTLOOP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP19]], [[RDX]] +; IF-EVL-OUTLOOP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP19]], i32 [[RDX]] ; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL-OUTLOOP: for.end: -; IF-EVL-OUTLOOP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-OUTLOOP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: ret i32 [[SMIN_LCSSA]] ; ; IF-EVL-INLOOP-LABEL: @smin( @@ -427,19 +427,19 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) -; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.smin.nxv4i32(i32 2147483647, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-INLOOP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP14]], i32 [[VEC_PHI]]) -; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] +; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = call i32 @llvm.vp.reduce.smin.nxv4i32(i32 2147483647, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; IF-EVL-INLOOP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP13]], i32 [[VEC_PHI]]) +; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]] ; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-INLOOP-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL-INLOOP: middle.block: ; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL-INLOOP: scalar.ph: @@ -450,9 +450,9 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-INLOOP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP17]], [[RDX]] -; IF-EVL-INLOOP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]] +; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-INLOOP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP16]], [[RDX]] +; IF-EVL-INLOOP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP16]], i32 [[RDX]] ; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 7c1a217326c3e..84e6bcc297fbd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -53,7 +53,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) ; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = select [[TMP18]], [[VP_OP_LOAD]], zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = add [[TMP19]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP19]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP20]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP21]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT1]] = add i64 [[TMP22]], [[EVL_BASED_IV1]] @@ -302,7 +302,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = add [[VEC_PHI]], [[VP_OP_LOAD]] +; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = call @llvm.vp.add.nxv4i32( [[VEC_PHI]], [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = xor [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = select [[TMP15]], [[TMP20]], zeroinitializer ; IF-EVL-OUTLOOP-NEXT: [[PREDPHI1:%.*]] = select [[TMP21]], [[VEC_PHI]], [[TMP19]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll index 7e764f528dd3e..f1213088f92fc 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll @@ -59,7 +59,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] ; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]), !alias.scope [[META0:![0-9]+]] -; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = add [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP_LOAD]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP19]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP16]], [[VEC_PHI]], i32 [[TMP12]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] @@ -68,7 +68,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: ; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) -; IF-EVL-OUTLOOP-NEXT: store i32 [[TMP23]], ptr [[ADDR]], align 4 +; IF-EVL-OUTLOOP-NEXT: store i32 [[TMP23]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] ; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL-OUTLOOP: scalar.ph: ; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] @@ -83,7 +83,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-OUTLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 ; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL-OUTLOOP: for.end: ; IF-EVL-OUTLOOP-NEXT: ret void ; @@ -131,7 +131,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-INLOOP: middle.block: -; IF-EVL-INLOOP-NEXT: store i32 [[TMP22]], ptr [[ADDR]], align 4 +; IF-EVL-INLOOP-NEXT: store i32 [[TMP22]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] ; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL-INLOOP: scalar.ph: ; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] @@ -146,7 +146,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-INLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 ; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL-INLOOP: for.end: ; IF-EVL-INLOOP-NEXT: ret void ; @@ -187,7 +187,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; NO-VP-OUTLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; NO-VP-OUTLOOP: middle.block: ; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP12]]) -; NO-VP-OUTLOOP-NEXT: store i32 [[TMP14]], ptr [[ADDR]], align 4 +; NO-VP-OUTLOOP-NEXT: store i32 [[TMP14]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] ; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; NO-VP-OUTLOOP: scalar.ph: @@ -203,7 +203,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; NO-VP-OUTLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 ; NO-VP-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; NO-VP-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; NO-VP-OUTLOOP: for.end: ; NO-VP-OUTLOOP-NEXT: ret void ; @@ -243,7 +243,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; NO-VP-INLOOP: middle.block: -; NO-VP-INLOOP-NEXT: store i32 [[TMP12]], ptr [[ADDR]], align 4 +; NO-VP-INLOOP-NEXT: store i32 [[TMP12]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] ; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; NO-VP-INLOOP: scalar.ph: @@ -259,7 +259,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; NO-VP-INLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 ; NO-VP-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; NO-VP-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; NO-VP-INLOOP: for.end: ; NO-VP-INLOOP-NEXT: ret void ; @@ -290,7 +290,9 @@ for.end: ; IF-EVL-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} ; IF-EVL-OUTLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} ; IF-EVL-OUTLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +; IF-EVL-OUTLOOP: [[META6]] = !{[[META7:![0-9]+]]} +; IF-EVL-OUTLOOP: [[META7]] = distinct !{[[META7]], [[META2]]} +; IF-EVL-OUTLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} ;. ; IF-EVL-INLOOP: [[META0]] = !{[[META1:![0-9]+]]} ; IF-EVL-INLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} @@ -298,7 +300,9 @@ for.end: ; IF-EVL-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} ; IF-EVL-INLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} ; IF-EVL-INLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +; IF-EVL-INLOOP: [[META6]] = !{[[META7:![0-9]+]]} +; IF-EVL-INLOOP: [[META7]] = distinct !{[[META7]], [[META2]]} +; IF-EVL-INLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} ;. ; NO-VP-OUTLOOP: [[META0]] = !{[[META1:![0-9]+]]} ; NO-VP-OUTLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} @@ -306,7 +310,9 @@ for.end: ; NO-VP-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} ; NO-VP-OUTLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} ; NO-VP-OUTLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} -; NO-VP-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +; NO-VP-OUTLOOP: [[META6]] = !{[[META7:![0-9]+]]} +; NO-VP-OUTLOOP: [[META7]] = distinct !{[[META7]], [[META2]]} +; NO-VP-OUTLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} ;. ; NO-VP-INLOOP: [[META0]] = !{[[META1:![0-9]+]]} ; NO-VP-INLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} @@ -314,5 +320,7 @@ for.end: ; NO-VP-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} ; NO-VP-INLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} ; NO-VP-INLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} -; NO-VP-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +; NO-VP-INLOOP: [[META6]] = !{[[META7:![0-9]+]]} +; NO-VP-INLOOP: [[META7]] = distinct !{[[META7]], [[META2]]} +; NO-VP-INLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll index de4631a006535..5360fbb7c9bb3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll @@ -31,38 +31,38 @@ define i32 @add(ptr %a, i64 %n, i32 %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP15:%.*]] = add [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP_LOAD]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[VP_OP]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP20]], [[RDX]] +; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP18]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[ADD_LCSSA]] ; ; NO-VP-LABEL: @add( @@ -232,38 +232,38 @@ define i32 @or(ptr %a, i64 %n, i32 %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP15:%.*]] = or [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.or.nxv4i32( [[VP_OP_LOAD]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[VP_OP]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32( [[TMP16]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32( [[TMP14]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[OR]] = or i32 [[TMP20]], [[RDX]] +; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[OR]] = or i32 [[TMP18]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[OR_LCSSA]] ; ; NO-VP-LABEL: @or( @@ -352,38 +352,38 @@ define i32 @and(ptr %a, i64 %n, i32 %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP15:%.*]] = and [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.and.nxv4i32( [[VP_OP_LOAD]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[VP_OP]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32( [[TMP16]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32( [[TMP14]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[AND]] = and i32 [[TMP20]], [[RDX]] +; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[AND]] = and i32 [[TMP18]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[AND_LCSSA]] ; ; NO-VP-LABEL: @and( @@ -472,38 +472,38 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP15:%.*]] = xor [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.xor.nxv4i32( [[VP_OP_LOAD]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[VP_OP]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP16]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP14]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP20]], [[RDX]] +; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP18]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[XOR_LCSSA]] ; ; NO-VP-LABEL: @xor( @@ -587,46 +587,46 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 -; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = icmp slt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP16]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP15]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP20]], [[RDX]] -; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP20]], i32 [[RDX]] +; IF-EVL-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP19]], [[RDX]] +; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP19]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[SMIN_LCSSA]] ; ; NO-VP-LABEL: @smin( @@ -714,46 +714,46 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 -; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp sgt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = icmp sgt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32( [[TMP16]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32( [[TMP15]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP20]], [[RDX]] -; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP20]], i32 [[RDX]] +; IF-EVL-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP19]], [[RDX]] +; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP19]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[SMAX_LCSSA]] ; ; NO-VP-LABEL: @smax( @@ -841,46 +841,46 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 -; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ult [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = icmp ult [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32( [[TMP16]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32( [[TMP15]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP20]], [[RDX]] -; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP20]], i32 [[RDX]] +; IF-EVL-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP19]], [[RDX]] +; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP19]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[UMIN_LCSSA]] ; ; NO-VP-LABEL: @umin( @@ -968,46 +968,46 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 -; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ugt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = icmp ugt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32( [[TMP16]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32( [[TMP15]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP20]], [[RDX]] -; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP20]], i32 [[RDX]] +; IF-EVL-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP19]], [[RDX]] +; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP19]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[UMAX_LCSSA]] ; ; NO-VP-LABEL: @umax( @@ -1100,38 +1100,38 @@ define float @fadd(ptr %a, i64 %n, float %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP15:%.*]] = fadd reassoc [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call reassoc @llvm.vp.fadd.nxv4f32( [[VP_OP_LOAD]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[VP_OP]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP16]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP14]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP20]], [[RDX]] +; IF-EVL-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP18]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[ADD_LCSSA]] ; ; NO-VP-LABEL: @fadd( @@ -1296,46 +1296,46 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, float [[START:%.*]], i64 0 -; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast olt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = fcmp fast olt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32( [[TMP16]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32( [[TMP15]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP20]], [[RDX]] -; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP20]], float [[RDX]] +; IF-EVL-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP19]], [[RDX]] +; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP19]], float [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MIN_LCSSA]] ; ; NO-VP-LABEL: @fmin( @@ -1423,46 +1423,46 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, float [[START:%.*]], i64 0 -; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[START:%.*]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast ogt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = fcmp fast ogt [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32( [[TMP16]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32( [[TMP15]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP20]], [[RDX]] -; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP20]], float [[RDX]] +; IF-EVL-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP19]], [[RDX]] +; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP19]], float [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MAX_LCSSA]] ; ; NO-VP-LABEL: @fmax( @@ -1717,43 +1717,43 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP12]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = call reassoc @llvm.fmuladd.nxv4f32( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], [[VEC_PHI]]) -; IF-EVL-NEXT: [[TMP18]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP17]], [[VEC_PHI]], i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call reassoc @llvm.fmuladd.nxv4f32( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], [[VEC_PHI]]) +; IF-EVL-NEXT: [[TMP17]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP16]], [[VEC_PHI]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP21:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP18]]) +; IF-EVL-NEXT: [[TMP20:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP17]]) ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP22]], float [[TMP23]], float [[RDX]]) +; IF-EVL-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP21]], float [[TMP22]], float [[RDX]]) ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MULADD_LCSSA]] ; ; NO-VP-LABEL: @fmuladd( @@ -1848,25 +1848,25 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP15:%.*]] = or [[VEC_PHI]], [[TMP14]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = icmp slt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP14:%.*]] = or [[VEC_PHI]], [[TMP13]] +; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP16]]) -; IF-EVL-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]] -; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 [[INV:%.*]], i32 [[START:%.*]] +; IF-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP15]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]] +; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP19]], i32 [[INV:%.*]], i32 [[START:%.*]] ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1876,8 +1876,8 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP21]], 3 +; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP20]], 3 ; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] @@ -1975,25 +1975,25 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast olt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[TMP15:%.*]] = or [[VEC_PHI]], [[TMP14]] -; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP15]], [[VEC_PHI]], i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = fcmp fast olt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[TMP14:%.*]] = or [[VEC_PHI]], [[TMP13]] +; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP16]]) -; IF-EVL-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]] -; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 [[INV:%.*]], i32 [[START:%.*]] +; IF-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP15]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]] +; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP19]], i32 [[INV:%.*]], i32 [[START:%.*]] ; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -2003,8 +2003,8 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP21]], 3.000000e+00 +; IF-EVL-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP20]], 3.000000e+00 ; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 23788c941190a..ca830972bf696 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -37,12 +37,13 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-OUTLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> ; IF-EVL-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, vp<[[RDX_SELECT:%.+]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%n> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%n>, vp<[[EVL_PHI]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> ; IF-EVL-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> ; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD1]]>, ir<[[RDX_PHI]]> +; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = vp.add ir<[[LD1]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]> ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_SELECT]]> = merge-until-pivot ir, ir<[[ADD]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]> ; IF-EVL-OUTLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> @@ -54,17 +55,16 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-EMPTY: ; IF-EVL-OUTLOOP-NEXT: middle.block: ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_EXTRACT:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> ; IF-EVL-OUTLOOP-NEXT: EMIT branch-on-cond ir ; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph ; IF-EVL-OUTLOOP-EMPTY: ; IF-EVL-OUTLOOP-NEXT: ir-bb: +; IF-EVL-OUTLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]>) ; IF-EVL-OUTLOOP-NEXT: No successors ; IF-EVL-OUTLOOP-EMPTY: ; IF-EVL-OUTLOOP-NEXT: scalar.ph: ; IF-EVL-OUTLOOP-NEXT: No successors -; IF-EVL-OUTLOOP-EMPTY: -; IF-EVL-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EXTRACT]]> ; IF-EVL-OUTLOOP-NEXT: } ; From 921931a1ea2b738278e9e9a0aa1de6fcbda358d8 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 3 Oct 2024 07:29:02 -0700 Subject: [PATCH 13/35] Updated VPlanAnalysis --- llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 8b8ab6be99b0d..44468b3e5c1bc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -50,6 +50,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return SetResultTyFromOp(); switch (Opcode) { + case VPInstruction::MergeUntilPivot: case Instruction::Select: { Type *ResTy = inferScalarType(R->getOperand(1)); VPValue *OtherV = R->getOperand(2); From c6a389d4e9a9c0f0ef2ac4099f89d86a799b2c33 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 7 Oct 2024 02:22:29 -0700 Subject: [PATCH 14/35] Move Ctx into transformRecipestoEVLRecipes --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index d8a7f35a4cb06..7bb86411ea620 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1441,9 +1441,9 @@ void VPlanTransforms::addActiveLaneMask( } /// Replace recipes with their EVL variants. -static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL, - LLVMContext &Ctx) { +static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { using namespace llvm::VPlanPatternMatch; + LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext(); SmallVector HeaderMasks = collectAllHeaderMasks(Plan); VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { @@ -1605,8 +1605,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength( NextEVLIV->insertBefore(CanonicalIVIncrement); EVLPhi->addOperand(NextEVLIV); - LLVMContext &Ctx = CanonicalIVPHI->getScalarType()->getContext(); - transformRecipestoEVLRecipes(Plan, *VPEVL, Ctx); + transformRecipestoEVLRecipes(Plan, *VPEVL); // Replace all uses of VPCanonicalIVPHIRecipe by // VPEVLBasedIVPHIRecipe except for the canonical IV increment. From a52d3dec4bbd496048c3d352bc7ceb4a3e15df6e Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 7 Oct 2024 02:37:35 -0700 Subject: [PATCH 15/35] Comment for all true condition. nfc --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 7bb86411ea620..28810a4a59e7b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1487,9 +1487,12 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) return nullptr; - VPValue *Cond = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx)); + // Use all true as the condition because this transformation is + // limited to selects whose condition is a header mask. + VPValue *AllTrue = + Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx)); return new VPInstruction(VPInstruction::MergeUntilPivot, - {Cond, LHS, RHS, &EVL}, + {AllTrue, LHS, RHS, &EVL}, VPI->getDebugLoc()); }) .Default([&](VPRecipeBase *R) { return nullptr; }); From 9f4f2b858f88304cfd5e501a152af499a0c8120a Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 7 Oct 2024 02:58:48 -0700 Subject: [PATCH 16/35] Remove unnecessary code --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index d45298846d963..6c994c9aeeaf6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -638,9 +638,9 @@ Value *VPInstruction::generate(VPTransformState &State) { return NewPhi; } case VPInstruction::MergeUntilPivot: { - Value *Cond = State.get(getOperand(0), 0); - Value *OnTrue = State.get(getOperand(1), 0); - Value *OnFalse = State.get(getOperand(2), 0); + Value *Cond = State.get(getOperand(0)); + Value *OnTrue = State.get(getOperand(1)); + Value *OnFalse = State.get(getOperand(2)); Value *Pivot = State.get(getOperand(3), /* IsScalar */ true); assert(Pivot->getType()->isIntegerTy() && "Pivot should be an integer."); return Builder.CreateIntrinsic(Intrinsic::vp_merge, {OnTrue->getType()}, From 31c3e707add67a8b732759b1cce51382dd490212 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 14 Oct 2024 07:44:58 -0700 Subject: [PATCH 17/35] NEED_FIX: Requires cost of vp.merge in RISCV TTI --- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 046f63fe1617c..ea2e77d9b24db 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1202,6 +1202,8 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, *FOp, ICA.getArgTypes()[0], UI->getPointerAlignment(), UI->getOperand(1)->getType()->getPointerAddressSpace(), CostKind); } + case Intrinsic::vp_merge: + return 1; case Intrinsic::vp_select: { Intrinsic::ID IID = ICA.getID(); std::optional FOp = VPIntrinsic::getFunctionalOpcodeForVP(IID); From efebb1db9377b343f38d5b7f5c58f087ddcc90ad Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 14 Oct 2024 07:47:37 -0700 Subject: [PATCH 18/35] Emit VPWidenIntrinsicRecipe --- llvm/lib/Transforms/Vectorize/VPlan.h | 15 +++++++++++++++ llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 10 ++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 97146c1f0c12b..764c2ad1cf458 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1677,6 +1677,21 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags { !Attrs.hasFnAttr(Attribute::WillReturn); } + VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, + std::initializer_list CallArguments, + Type *Ty, DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments), + VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) { + LLVMContext &Ctx = Ty->getContext(); + AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID); + MemoryEffects ME = Attrs.getMemoryEffects(); + MayReadFromMemory = ME.onlyWritesMemory(); + MayWriteToMemory = ME.onlyReadsMemory(); + MayHaveSideEffects = MayWriteToMemory || + Attrs.hasFnAttr(Attribute::NoUnwind) || + !Attrs.hasFnAttr(Attribute::WillReturn); + } + ~VPWidenIntrinsicRecipe() override = default; VPWidenIntrinsicRecipe *clone() override { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 28810a4a59e7b..f45b3cd8f5069 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1443,7 +1443,9 @@ void VPlanTransforms::addActiveLaneMask( /// Replace recipes with their EVL variants. static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { using namespace llvm::VPlanPatternMatch; - LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext(); + Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); + VPTypeAnalysis TypeInfo(CanonicalIVType); + LLVMContext &Ctx = CanonicalIVType->getContext(); SmallVector HeaderMasks = collectAllHeaderMasks(Plan); VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { @@ -1491,9 +1493,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { // limited to selects whose condition is a header mask. VPValue *AllTrue = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx)); - return new VPInstruction(VPInstruction::MergeUntilPivot, - {AllTrue, LHS, RHS, &EVL}, - VPI->getDebugLoc()); + return new VPWidenIntrinsicRecipe( + Intrinsic::vp_merge, {AllTrue, LHS, RHS, &EVL}, + TypeInfo.inferScalarType(LHS), VPI->getDebugLoc()); }) .Default([&](VPRecipeBase *R) { return nullptr; }); From 7cce6585707caf831f6c8de4b3096d468e3f00ee Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 14 Oct 2024 07:48:32 -0700 Subject: [PATCH 19/35] Update VPlanVerify --- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index a89fe04329102..3e38fcc88354e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -178,6 +178,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { return VerifyEVLUse(*I, 3); } }) + .Case( + [&](const VPWidenIntrinsicRecipe *I) { + return VerifyEVLUse(*I, I->getNumOperands() - 1); + }) .Default([&](const VPUser *U) { errs() << "EVL has unexpected user\n"; return false; From d88b12b4f2dac937e4e67671ca270a4d0298c002 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 14 Oct 2024 07:52:10 -0700 Subject: [PATCH 20/35] Update test case --- .../LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index ca830972bf696..0ba69928aa37d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -44,7 +44,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> ; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> ; IF-EVL-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = vp.add ir<[[LD1]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_SELECT]]> = merge-until-pivot ir, ir<[[ADD]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]> +; IF-EVL-OUTLOOP-NEXT: WIDEN-INTRINSIC vp<[[RDX_SELECT]]> = call llvm.vp.merge(ir, ir<[[ADD]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]>) ; IF-EVL-OUTLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> From 36c857ba2adf9b2e7802eaf6be1d868fc59d94de Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 18 Oct 2024 00:18:14 -0700 Subject: [PATCH 21/35] Revert "NEED_FIX: Requires cost of vp.merge in RISCV TTI" This reverts commit 817d7611b08d59652f63a36c94ffcd9a3dde9c02. --- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index ea2e77d9b24db..046f63fe1617c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1202,8 +1202,6 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, *FOp, ICA.getArgTypes()[0], UI->getPointerAlignment(), UI->getOperand(1)->getType()->getPointerAddressSpace(), CostKind); } - case Intrinsic::vp_merge: - return 1; case Intrinsic::vp_select: { Intrinsic::ID IID = ICA.getID(); std::optional FOp = VPIntrinsic::getFunctionalOpcodeForVP(IID); From bc20d2a5b07a860dd4a2bd85d16c34ae29ccc1c4 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 18 Oct 2024 00:19:57 -0700 Subject: [PATCH 22/35] Revert "Update VPlanVerify" This reverts commit 51235cfdb22112334919c2c89a5a84b3cea50f68. --- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 3e38fcc88354e..a89fe04329102 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -178,10 +178,6 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { return VerifyEVLUse(*I, 3); } }) - .Case( - [&](const VPWidenIntrinsicRecipe *I) { - return VerifyEVLUse(*I, I->getNumOperands() - 1); - }) .Default([&](const VPUser *U) { errs() << "EVL has unexpected user\n"; return false; From ba74ffd525aaabf5f11a7b4a73deefa05d944782 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 18 Oct 2024 01:22:58 -0700 Subject: [PATCH 23/35] Rebase and update test cases --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 1 - .../LoopVectorize/RISCV/inloop-reduction.ll | 2 +- .../vectorize-force-tail-with-evl-cond-reduction.ll | 2 +- .../RISCV/vectorize-force-tail-with-evl-reduction.ll | 12 ++++++------ .../RISCV/vplan-vp-intrinsics-reduction.ll | 4 ++-- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f45b3cd8f5069..9c627457a7d9a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1447,7 +1447,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPTypeAnalysis TypeInfo(CanonicalIVType); LLVMContext &Ctx = CanonicalIVType->getContext(); SmallVector HeaderMasks = collectAllHeaderMasks(Plan); - VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { for (VPUser *U : collectUsersRecursively(HeaderMask)) { auto *CurRecipe = cast(U); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll index 7ae58cb082428..8ca2bd1f286ae 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -378,7 +378,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp slt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = call @llvm.vp.select.nxv4i32( [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 84e6bcc297fbd..e433899fb10b6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -52,7 +52,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = select [[TMP18]], [[VP_OP_LOAD]], zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = call @llvm.vp.select.nxv4i32( [[TMP18]], [[VP_OP_LOAD]], zeroinitializer, i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP19]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP20]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP21]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = zext i32 [[TMP11]] to i64 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll index 5360fbb7c9bb3..e594fce5a7a79 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll @@ -601,7 +601,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp slt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP14:%.*]] = call @llvm.vp.select.nxv4i32( [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] @@ -728,7 +728,7 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp sgt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP14:%.*]] = call @llvm.vp.select.nxv4i32( [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] @@ -855,7 +855,7 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp ult [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP14:%.*]] = call @llvm.vp.select.nxv4i32( [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] @@ -982,7 +982,7 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp ugt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP14:%.*]] = call @llvm.vp.select.nxv4i32( [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] @@ -1310,7 +1310,7 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = fcmp fast olt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP14:%.*]] = call @llvm.vp.select.nxv4f32( [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] @@ -1437,7 +1437,7 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = fcmp fast ogt [[VP_OP_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP14:%.*]] = select [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP14:%.*]] = call @llvm.vp.select.nxv4f32( [[TMP13]], [[VP_OP_LOAD]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP15]] = call @llvm.vp.merge.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP14]], [[VEC_PHI]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 0ba69928aa37d..aeecfa56f3294 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -35,7 +35,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: vector loop: { ; IF-EVL-OUTLOOP-NEXT: vector.body: ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-OUTLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> +; IF-EVL-OUTLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, vp<[[RDX_SELECT:%.+]]> ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%n>, vp<[[EVL_PHI]]> ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> @@ -47,7 +47,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: WIDEN-INTRINSIC vp<[[RDX_SELECT]]> = call llvm.vp.merge(ir, ir<[[ADD]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]>) ; IF-EVL-OUTLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> ; IF-EVL-OUTLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> ; IF-EVL-OUTLOOP-NEXT: No successors ; IF-EVL-OUTLOOP-NEXT: } From 02475a0d40faafa88dbae057d72da1d256f0ff4a Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 18 Oct 2024 01:33:40 -0700 Subject: [PATCH 24/35] Revert "Updated VPlanVerifier" This reverts commit a4cacc7e48d284d350098e74586bbfbee9ae058c. --- .../Transforms/Vectorize/VPlanVerifier.cpp | 31 ++++++++----------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index a89fe04329102..8bdb313324358 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -156,27 +156,22 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { .Case( [&](const VPScalarCastRecipe *S) { return true; }) .Case([&](const VPInstruction *I) { - switch (I->getOpcode()) { - default: + if (I->getOpcode() != Instruction::Add) { errs() - << "EVL is used in VPInstructions that are neither " - "Instruction::Add nor VPInstruction::MergeUntilPivot\n"; + << "EVL is used as an operand in non-VPInstruction::Add\n"; return false; - case Instruction::Add: - if (I->getNumUsers() != 1) { - errs() << "EVL is used in VPInstruction:Add with multiple " - "users\n"; - return false; - } - if (!isa(*I->users().begin())) { - errs() << "Result of VPInstruction::Add with EVL operand is " - "not used by VPEVLBasedIVPHIRecipe\n"; - return false; - } - return true; - case VPInstruction::MergeUntilPivot: - return VerifyEVLUse(*I, 3); } + if (I->getNumUsers() != 1) { + errs() << "EVL is used in VPInstruction:Add with multiple " + "users\n"; + return false; + } + if (!isa(*I->users().begin())) { + errs() << "Result of VPInstruction::Add with EVL operand is " + "not used by VPEVLBasedIVPHIRecipe\n"; + return false; + } + return true; }) .Default([&](const VPUser *U) { errs() << "EVL has unexpected user\n"; From 5b2ee05244752ed40b71a633d623fffb0d632e07 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 18 Oct 2024 01:34:19 -0700 Subject: [PATCH 25/35] Revert "Updated VPlanAnalysis" This reverts commit 05349558c8ea19b8224dc9b2473bb6dcbd7f0f70. --- llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 44468b3e5c1bc..8b8ab6be99b0d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -50,7 +50,6 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return SetResultTyFromOp(); switch (Opcode) { - case VPInstruction::MergeUntilPivot: case Instruction::Select: { Type *ResTy = inferScalarType(R->getOperand(1)); VPValue *OtherV = R->getOperand(2); From f35fbe6c1133baf2b01ce108088856a4212d919e Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 18 Oct 2024 01:55:54 -0700 Subject: [PATCH 26/35] Remove VPInstruction::MergeUntilPivot --- llvm/lib/Transforms/Vectorize/VPlan.h | 5 ----- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 19 +------------------ 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 764c2ad1cf458..9b6f26c5ccac9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1230,11 +1230,6 @@ class VPInstruction : public VPRecipeWithIRFlags, // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). PtrAdd, - // Selects elements from two vectors (second and third operand) based on a - // condition vector (first operand) and a pivot index (fourth operand). The - // lanes whose positions are greater than or equal to the pivot are taken - // from the third operand. - MergeUntilPivot, }; private: diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 6c994c9aeeaf6..6254ea1519181 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -64,7 +64,6 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::PtrAdd: - case VPInstruction::MergeUntilPivot: return false; default: return true; @@ -637,16 +636,7 @@ Value *VPInstruction::generate(VPTransformState &State) { } return NewPhi; } - case VPInstruction::MergeUntilPivot: { - Value *Cond = State.get(getOperand(0)); - Value *OnTrue = State.get(getOperand(1)); - Value *OnFalse = State.get(getOperand(2)); - Value *Pivot = State.get(getOperand(3), /* IsScalar */ true); - assert(Pivot->getType()->isIntegerTy() && "Pivot should be an integer."); - return Builder.CreateIntrinsic(Intrinsic::vp_merge, {OnTrue->getType()}, - {Cond, OnTrue, OnFalse, Pivot}, nullptr, - Name); - } + default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -728,9 +718,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::BranchOnCond: case VPInstruction::ResumePhi: return true; - case VPInstruction::MergeUntilPivot: - // Pivot must be an integer. - return Op == getOperand(3); }; llvm_unreachable("switch should return"); } @@ -749,7 +736,6 @@ bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { case VPInstruction::BranchOnCount: case VPInstruction::BranchOnCond: case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::MergeUntilPivot: return true; }; llvm_unreachable("switch should return"); @@ -816,9 +802,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::PtrAdd: O << "ptradd"; break; - case VPInstruction::MergeUntilPivot: - O << "merge-until-pivot"; - break; default: O << Instruction::getOpcodeName(getOpcode()); } From 744580695cd98f390c9c8aaa8ed53311a7833210 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 18 Oct 2024 02:21:46 -0700 Subject: [PATCH 27/35] Remove irrelevant updates --- ...-force-tail-with-evl-intermediate-store.ll | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll index f1213088f92fc..80ed71571f7c4 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll @@ -131,7 +131,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-INLOOP: middle.block: -; IF-EVL-INLOOP-NEXT: store i32 [[TMP22]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] +; IF-EVL-INLOOP-NEXT: store i32 [[TMP22]], ptr [[ADDR]], align 4 ; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL-INLOOP: scalar.ph: ; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] @@ -146,7 +146,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-INLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 ; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL-INLOOP: for.end: ; IF-EVL-INLOOP-NEXT: ret void ; @@ -187,7 +187,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; NO-VP-OUTLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; NO-VP-OUTLOOP: middle.block: ; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP12]]) -; NO-VP-OUTLOOP-NEXT: store i32 [[TMP14]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] +; NO-VP-OUTLOOP-NEXT: store i32 [[TMP14]], ptr [[ADDR]], align 4 ; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; NO-VP-OUTLOOP: scalar.ph: @@ -203,7 +203,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; NO-VP-OUTLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 ; NO-VP-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; NO-VP-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; NO-VP-OUTLOOP: for.end: ; NO-VP-OUTLOOP-NEXT: ret void ; @@ -243,7 +243,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; NO-VP-INLOOP: middle.block: -; NO-VP-INLOOP-NEXT: store i32 [[TMP12]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] +; NO-VP-INLOOP-NEXT: store i32 [[TMP12]], ptr [[ADDR]], align 4 ; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; NO-VP-INLOOP: scalar.ph: @@ -259,7 +259,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; NO-VP-INLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 ; NO-VP-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; NO-VP-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; NO-VP-INLOOP: for.end: ; NO-VP-INLOOP-NEXT: ret void ; @@ -300,9 +300,7 @@ for.end: ; IF-EVL-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} ; IF-EVL-INLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} ; IF-EVL-INLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL-INLOOP: [[META6]] = !{[[META7:![0-9]+]]} -; IF-EVL-INLOOP: [[META7]] = distinct !{[[META7]], [[META2]]} -; IF-EVL-INLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} +; IF-EVL-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} ;. ; NO-VP-OUTLOOP: [[META0]] = !{[[META1:![0-9]+]]} ; NO-VP-OUTLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} @@ -310,9 +308,7 @@ for.end: ; NO-VP-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} ; NO-VP-OUTLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} ; NO-VP-OUTLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} -; NO-VP-OUTLOOP: [[META6]] = !{[[META7:![0-9]+]]} -; NO-VP-OUTLOOP: [[META7]] = distinct !{[[META7]], [[META2]]} -; NO-VP-OUTLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} +; NO-VP-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} ;. ; NO-VP-INLOOP: [[META0]] = !{[[META1:![0-9]+]]} ; NO-VP-INLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} @@ -320,7 +316,5 @@ for.end: ; NO-VP-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} ; NO-VP-INLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} ; NO-VP-INLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} -; NO-VP-INLOOP: [[META6]] = !{[[META7:![0-9]+]]} -; NO-VP-INLOOP: [[META7]] = distinct !{[[META7]], [[META2]]} -; NO-VP-INLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} +; NO-VP-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} ;. From 2f9c25660695a24c21c54a2ef76b21054c053924 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 22 Oct 2024 01:42:10 -0700 Subject: [PATCH 28/35] Rebase --- .../Transforms/Vectorize/VPlanPatternMatch.h | 28 ------------------- 1 file changed, 28 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 838e5479a2e5a..38f82aa33dc1a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -178,11 +178,6 @@ struct Recipe_match { static_assert(std::tuple_size::value == 2 && "constructor can only be used for binary matcher"); } - template - Recipe_match(A_t A, B_t B, C_t C) : Ops({A, B, C}) { - static_assert(std::tuple_size::value == 3 && - "constructor can only be used for ternary matcher"); - } bool match(const VPValue *V) const { auto *DefR = V->getDefiningRecipe(); @@ -240,16 +235,6 @@ using AllBinaryRecipe_match = BinaryRecipe_match; -template -using TernaryRecipe_match = Recipe_match, - Opcode, Commutative, RecipeTys...>; - -template -using TernaryVPInstruction_match = - TernaryRecipe_match; - template inline UnaryVPInstruction_match m_VPInstruction(const Op0_t &Op0) { @@ -262,12 +247,6 @@ m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1) { return BinaryVPInstruction_match(Op0, Op1); } -template -inline TernaryVPInstruction_match -m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { - return TernaryVPInstruction_match(Op0, Op1, Op2); -} - template inline UnaryVPInstruction_match m_Not(const Op0_t &Op0) { @@ -404,13 +383,6 @@ inline VPScalarIVSteps_match m_ScalarIVSteps(const Op0_t &Op0, return VPScalarIVSteps_match(Op0, Op1); } -template -inline TernaryVPInstruction_match -m_Select(const Op0_t &Cond, const Op1_t &LHS, const Op2_t &RHS) { - return m_VPInstruction(Cond, LHS, - RHS); -} - } // namespace VPlanPatternMatch } // namespace llvm From 908250cb4458886a90d129d9480b36ddf33c7a83 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 22 Oct 2024 02:47:36 -0700 Subject: [PATCH 29/35] doc the code. --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9c627457a7d9a..70cad4bee0ed8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1485,6 +1485,10 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { }) .Case([&](VPInstruction *VPI) -> VPRecipeBase * { VPValue *LHS, *RHS; + // Transform predicated reduction select + // select(header_mask, LHS, RHS) + // into vector predication merge. + // vp.merge(all-true, LHS, RHS, EVL) if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) return nullptr; From d020ce0bc655c8fb50b0130471df775010fcf8e6 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 25 Oct 2024 05:38:05 -0700 Subject: [PATCH 30/35] drop the blank line --- llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 38f82aa33dc1a..18e5e2996c828 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -382,7 +382,6 @@ inline VPScalarIVSteps_match m_ScalarIVSteps(const Op0_t &Op0, const Op1_t &Op1) { return VPScalarIVSteps_match(Op0, Op1); } - } // namespace VPlanPatternMatch } // namespace llvm From 012079e9b3ca8d6980b7dbfb4d2e2a4d088f2847 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 28 Oct 2024 06:34:55 -0700 Subject: [PATCH 31/35] Refine constructor of VPWidenIntrinsicRecipe --- llvm/lib/Transforms/Vectorize/VPlan.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9b6f26c5ccac9..18f5f13073aa6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1675,17 +1675,8 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags { VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, std::initializer_list CallArguments, Type *Ty, DebugLoc DL = {}) - : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments), - VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) { - LLVMContext &Ctx = Ty->getContext(); - AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID); - MemoryEffects ME = Attrs.getMemoryEffects(); - MayReadFromMemory = ME.onlyWritesMemory(); - MayWriteToMemory = ME.onlyReadsMemory(); - MayHaveSideEffects = MayWriteToMemory || - Attrs.hasFnAttr(Attribute::NoUnwind) || - !Attrs.hasFnAttr(Attribute::WillReturn); - } + : VPWidenIntrinsicRecipe(VectorIntrinsicID, + ArrayRef(CallArguments), Ty, DL) {} ~VPWidenIntrinsicRecipe() override = default; From f5ad37e4b30e8f2a7605771f02053ea00aea45c9 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 29 Oct 2024 02:13:30 -0700 Subject: [PATCH 32/35] Refine the comments according to Florian's comment --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4eed377a51e5e..e31531f16f862 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1484,7 +1484,8 @@ class LoopVectorizationCostModel { return InLoopReductions.contains(Phi); } - /// Returns true if the predicated reduction select should be used. + /// Returns true if the predicated reduction select should be used to set the + /// incoming value for the reduction phi. bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const { // Force to use predicated reduction select since the EVL of the // second-to-last iteration might not be VF*UF. From 8d9daf666b80e328b6501c15878d8c7799f792ef Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 30 Oct 2024 01:13:57 -0700 Subject: [PATCH 33/35] Updated test case --- .../LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index aeecfa56f3294..4e12e78bb2ba3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -64,7 +64,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: No successors ; IF-EVL-OUTLOOP-EMPTY: ; IF-EVL-OUTLOOP-NEXT: scalar.ph: +; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; IF-EVL-OUTLOOP-NEXT: No successors +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; IF-EVL-OUTLOOP-NEXT: } ; From e8cb377bbe3501101db8baf0013b345a6af2d8f9 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 5 Nov 2024 00:13:21 -0800 Subject: [PATCH 34/35] Rebase and update test case --- .../LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 4e12e78bb2ba3..414f39d557044 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -65,9 +65,13 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-EMPTY: ; IF-EVL-OUTLOOP-NEXT: scalar.ph: ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> -; IF-EVL-OUTLOOP-NEXT: No successors +; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb ; IF-EVL-OUTLOOP-EMPTY: -; IF-EVL-OUTLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> +; IF-EVL-OUTLOOP-NEXT: ir-bb: +; IF-EVL-OUTLOOP-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +; IF-EVL-OUTLOOP-NEXT: IR %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] +; IF-EVL-OUTLOOP: IR %exitcond.not = icmp eq i64 %iv.next, %n +; IF-EVL-OUTLOOP-NEXT: No successors ; IF-EVL-OUTLOOP-NEXT: } ; From 53d9ac6b0aa6d890e82cdffd53cd61faa025f772 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 5 Nov 2024 00:29:41 -0800 Subject: [PATCH 35/35] Updated comment --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 70cad4bee0ed8..ea8845eaa75d4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1485,7 +1485,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { }) .Case([&](VPInstruction *VPI) -> VPRecipeBase * { VPValue *LHS, *RHS; - // Transform predicated reduction select + // Transform select with a header mask condition // select(header_mask, LHS, RHS) // into vector predication merge. // vp.merge(all-true, LHS, RHS, EVL)