From 0b07987f452c801ce48996da95dc001972a8d95e Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 1 Apr 2025 22:28:09 +0100 Subject: [PATCH 1/7] [VPlan] Simplify VPBlendRecipes to select instructions When looking at some EVL tail folded code in SPEC CPU 2017 I noticed we sometimes have both VPBlendRecipes and select VPInstructions in the same plan: EMIT vp<%active.lane.mask> = active lane mask vp<%5>, vp<%3> EMIT vp<%7> = icmp ... EMIT vp<%8> = logical-and vp<%active.lane.mask>, vp<%7> BLEND ir<%8> = ir<%n.015> ir<%foo>/vp<%8> EMIT vp<%9> = select vp<%active.lane.mask>, ir<%8>, ir<%n.015> Since a blend will ultimately generate a chain of selects, we could fold the blend into the select: EMIT vp<%active.lane.mask> = active lane mask vp<%5>, vp<%3> EMIT vp<%7> = icmp ... EMIT vp<%8> = logical-and vp<%active.lane.mask>, vp<%7> EMIT ir<%8> = select vp<%8>, ir<%foo>, ir<%n.015> So this patch canonicalizes blends to a series of select instructions, which allows them to be simplified further with other select instructions. The `BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask` optimisation has also been converted to operate on selects. --- llvm/lib/Transforms/Vectorize/VPlan.h | 4 -- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 61 +++++-------------- .../Transforms/Vectorize/VPlanTransforms.cpp | 44 +++++-------- .../AArch64/masked-call-scalarize.ll | 4 +- .../LoopVectorize/AArch64/masked-call.ll | 4 +- .../RISCV/blocks-with-dead-instructions.ll | 4 +- .../Transforms/LoopVectorize/RISCV/divrem.ll | 24 ++++---- ...rize-force-tail-with-evl-cond-reduction.ll | 8 +-- .../LoopVectorize/X86/load-deref-pred.ll | 16 ++--- .../X86/replicate-uniform-call.ll | 55 +---------------- .../Transforms/LoopVectorize/if-conversion.ll | 4 +- .../LoopVectorize/load-deref-pred-align.ll | 4 +- .../pr55167-fold-tail-live-out.ll | 3 +- .../LoopVectorize/single-value-blend-phis.ll | 14 ++--- 14 files changed, 73 insertions(+), 176 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 61b5ccd85bc6e..2ba5affa2c82d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2297,10 +2297,6 @@ class VPBlendRecipe : public VPSingleDefRecipe { /// Generate the phi/select nodes. void execute(VPTransformState &State) override; - /// Return the cost of this VPWidenMemoryRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1a38932ef99fe..9acd52b2ec02c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -909,6 +909,19 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ctx.CostKind); } + case Instruction::Select: { + if (!getUnderlyingValue()) + return 0; + // Handle cases where only the first lane is used the same way as the legacy + // cost model. 
+ if (vputils::onlyFirstLaneUsed(this)) + return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); + Type *ResTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); + return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResTy, CmpTy, + CmpInst::BAD_ICMP_PREDICATE, + Ctx.CostKind); + } case VPInstruction::AnyOf: { auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); return Ctx.TTI.getArithmeticReductionCost( @@ -2380,53 +2393,7 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPBlendRecipe::execute(VPTransformState &State) { - assert(isNormalized() && "Expected blend to be normalized!"); - // We know that all PHIs in non-header blocks are converted into - // selects, so we don't have to worry about the insertion order and we - // can just use the builder. - // At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - - unsigned NumIncoming = getNumIncomingValues(); - - // Generate a sequence of selects of the form: - // SELECT(Mask3, In3, - // SELECT(Mask2, In2, - // SELECT(Mask1, In1, - // In0))) - // Note that Mask0 is never used: lanes for which no path reaches this phi and - // are essentially undef are taken from In0. - bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *Result = nullptr; - for (unsigned In = 0; In < NumIncoming; ++In) { - // We might have single edge PHIs (blocks) - use an identity - // 'select' for the first PHI operand. - Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed); - if (In == 0) - Result = In0; // Initialize with the first incoming value. - else { - // Select between the current value and the previous incoming edge - // based on the incoming mask. - Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed); - Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi"); - } - } - State.set(this, Result, OnlyFirstLaneUsed); -} - -InstructionCost VPBlendRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - // Handle cases where only the first lane is used the same way as the legacy - // cost model. 
- if (vputils::onlyFirstLaneUsed(this)) - return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); - - Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); - Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); - return (getNumIncomingValues() - 1) * - Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy, - CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind); + llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends"); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 8d4a73c744469..ee6856674871e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1082,6 +1082,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) return Def->replaceAllUsesWith(X); + // select !c, x, y -> select c, y, x + VPValue *C; + if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) { + Def->setOperand(0, C); + Def->setOperand(1, Y); + Def->setOperand(2, X); + return; + } + if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) return Def->replaceAllUsesWith(A); @@ -1288,38 +1297,17 @@ static void simplifyBlends(VPlan &Plan) { } } - SmallVector OperandsWithMask; - OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); - + VPBuilder Builder(&R); + VPValue *Select = Blend->getIncomingValue(StartIndex); for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { if (I == StartIndex) continue; - OperandsWithMask.push_back(Blend->getIncomingValue(I)); - OperandsWithMask.push_back(Blend->getMask(I)); - } - - auto *NewBlend = new VPBlendRecipe( - cast(Blend->getUnderlyingValue()), OperandsWithMask); - NewBlend->insertBefore(&R); - - VPValue *DeadMask = Blend->getMask(StartIndex); - Blend->replaceAllUsesWith(NewBlend); - Blend->eraseFromParent(); - recursivelyDeleteDeadRecipes(DeadMask); - - /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask. 
- VPValue *NewMask; - if (NewBlend->getNumOperands() == 3 && - match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) { - VPValue *Inc0 = NewBlend->getOperand(0); - VPValue *Inc1 = NewBlend->getOperand(1); - VPValue *OldMask = NewBlend->getOperand(2); - NewBlend->setOperand(0, Inc1); - NewBlend->setOperand(1, Inc0); - NewBlend->setOperand(2, NewMask); - if (OldMask->getNumUsers() == 0) - cast(OldMask)->eraseFromParent(); + Select = + Builder.createSelect(Blend->getMask(I), Blend->getIncomingValue(I), + Select, R.getDebugLoc(), "predphi"); + Select->setUnderlyingValue(Blend->getUnderlyingValue()); } + Blend->replaceAllUsesWith(Select); } } } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll index 078f98f54525b..c507c36f0dba9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll @@ -22,8 +22,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0 ; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer ; TFNONE-NEXT: [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]]) -; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer -; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) +; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ule <2 x double> [[TMP2]], zeroinitializer +; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer ; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1 ; TFNONE-NEXT: store double [[TMP14]], ptr [[P:%.*]], align 8 ; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index 6029095bbe7b1..b9b54aeca7f00 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -935,8 +935,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[TMP7]], i64 0 ; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TFNONE-NEXT: [[TMP8:%.*]] = call @exp_masked_scalable( [[BROADCAST_SPLAT]], splat (i1 true)) -; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ogt [[TMP8]], zeroinitializer -; TFNONE-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], zeroinitializer, splat (double 1.000000e+00) +; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ule [[TMP8]], zeroinitializer +; TFNONE-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], splat (double 1.000000e+00), zeroinitializer ; TFNONE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() ; TFNONE-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 2 ; TFNONE-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll index f8b83ff41f512..d5e07a615057d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll @@ -597,8 +597,8 @@ define void @empty_block_with_phi_1(ptr %src, i64 %N) #0 { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 2 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], splat (i16 99), [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], [[WIDE_LOAD]], splat (i16 99) ; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP11]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], [[TMP5]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index db780c3c12c7e..0451d30179a0a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -677,9 +677,9 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[WIDE_LOAD]], splat (i64 42) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq [[WIDE_LOAD]], splat (i64 42) ; CHECK-NEXT: [[TMP10:%.*]] = udiv [[WIDE_LOAD]], splat (i64 27) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP10]], [[WIDE_LOAD]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[WIDE_LOAD]], [[TMP10]] ; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -720,12 +720,12 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 ; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 -; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], splat (i64 42) -; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], splat (i64 42) +; FIXED-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42) +; FIXED-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD1]], splat (i64 42) ; FIXED-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], splat (i64 27) ; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], splat (i64 27) -; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[WIDE_LOAD]], <4 x i64> [[TMP6]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[WIDE_LOAD1]], <4 x i64> [[TMP7]] ; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8 ; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP3]], align 8 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -797,9 +797,9 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] ; 
CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[WIDE_LOAD]], splat (i64 42) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq [[WIDE_LOAD]], splat (i64 42) ; CHECK-NEXT: [[TMP10:%.*]] = sdiv [[WIDE_LOAD]], splat (i64 27) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP10]], [[WIDE_LOAD]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[WIDE_LOAD]], [[TMP10]] ; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -840,12 +840,12 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 ; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 -; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], splat (i64 42) -; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], splat (i64 42) +; FIXED-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42) +; FIXED-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD1]], splat (i64 42) ; FIXED-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], splat (i64 27) ; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], splat (i64 27) -; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[WIDE_LOAD]], <4 x i64> [[TMP6]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[WIDE_LOAD1]], <4 x i64> [[TMP7]] ; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8 ; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP3]], align 8 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 8c44da63e08a6..fbc3c0dc84219 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -398,9 +398,9 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 ; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load , ptr [[TMP21]], align 4 -; NO-VP-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt [[WIDE_MASKED_LOAD]], splat (i32 3) +; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = icmp sle [[WIDE_MASKED_LOAD]], splat (i32 3) ; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] -; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP18]], [[TMP16]], [[VEC_PHI]] +; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP12]], [[VEC_PHI]], [[TMP16]] ; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] ; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-OUTLOOP-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -956,9 +956,9 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, 
i32 %start) { ; NO-VP-OUTLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; NO-VP-OUTLOOP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0 ; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load , ptr [[TMP27]], align 4 -; NO-VP-OUTLOOP-NEXT: [[TMP28:%.*]] = icmp sgt [[WIDE_MASKED_LOAD]], [[VEC_IND]] +; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp sle [[WIDE_MASKED_LOAD]], [[VEC_IND]] ; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] -; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP28]], [[TMP22]], [[VEC_PHI]] +; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP13]], [[VEC_PHI]], [[TMP22]] ; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] ; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; NO-VP-OUTLOOP-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll index f4cd48de60243..7e66fcf8568d3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -34,10 +34,10 @@ define i32 @test_explicit_pred(i64 %len) { ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) ; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], splat (i64 4) -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp sge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp sge <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp sge <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4 @@ -47,10 +47,10 @@ define i32 @test_explicit_pred(i64 %len) { ; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[WIDE_LOAD8]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[WIDE_LOAD9]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD7]] +; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD8]] +; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD9]] ; 
CHECK-NEXT: [[TMP16]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[TMP17]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]] ; CHECK-NEXT: [[TMP18]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll index ed1c67c082134..5ad5cf14aea10 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll @@ -11,54 +11,9 @@ define void @smax_call_uniform(ptr %dst, i64 %x) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[C:%.*]] = icmp ult i8 -68, -69 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[X]], 0 -; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true) -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_UREM_IF:.*]], label %[[PRED_UREM_CONTINUE:.*]] -; CHECK: [[PRED_UREM_IF]]: -; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE]] -; CHECK: [[PRED_UREM_CONTINUE]]: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]] -; CHECK: [[PRED_UREM_IF1]]: -; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE2]] -; CHECK: [[PRED_UREM_CONTINUE2]]: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_UREM_IF3:.*]], label %[[PRED_UREM_CONTINUE4:.*]] -; CHECK: [[PRED_UREM_IF3]]: -; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE4]] -; CHECK: [[PRED_UREM_CONTINUE4]]: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UREM_IF5:.*]], label %[[PRED_UREM_CONTINUE6]] -; CHECK: [[PRED_UREM_IF5]]: -; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE6]] -; CHECK: [[PRED_UREM_CONTINUE6]]: -; CHECK-NEXT: [[TMP12:%.*]] = tail call i64 @llvm.smax.i64(i64 0, i64 0) -; CHECK-NEXT: [[TMP13:%.*]] = tail call i64 @llvm.smax.i64(i64 0, i64 0) -; CHECK-NEXT: [[P:%.*]] = select i1 [[C]], i64 1, i64 [[TMP12]] -; CHECK-NEXT: [[PREDPHI7:%.*]] = select i1 [[C]], i64 1, i64 [[TMP13]] -; CHECK-NEXT: [[ADD:%.*]] = add i64 [[P]], 1 -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[PREDPHI7]], 1 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[DST]], i64 [[ADD]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP17]] -; CHECK-NEXT: store i64 0, ptr [[GEP]], align 8 -; CHECK-NEXT: store i64 0, ptr [[TMP19]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; 
CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[ELSE:.*]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: [[REM1:%.*]] = urem i64 [[MUL]], [[X]] @@ -71,7 +26,7 @@ define void @smax_call_uniform(ptr %dst, i64 %x) { ; CHECK-NEXT: store i64 0, ptr [[GEP1]], align 8 ; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT1]], 1024 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -103,9 +58,3 @@ exit: } declare i64 @llvm.smax.i64(i64, i64) -;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -;. diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion.ll b/llvm/test/Transforms/LoopVectorize/if-conversion.ll index 8a7f4a386fda1..57109a0996a66 100644 --- a/llvm/test/Transforms/LoopVectorize/if-conversion.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion.ll @@ -159,10 +159,10 @@ define i32 @reduction_func(ptr nocapture %A, i32 %n) nounwind uwtable readonly s ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 30) +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], splat (i32 31) ; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[VEC_PHI]], splat (i32 2) ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[WIDE_LOAD]] -; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 8a326c9d0c083..7c6bc6aa6ca3a 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -567,7 +567,7 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <2 x i8> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]] @@ 
-577,7 +577,7 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 2 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> poison, i16 [[TMP9]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> [[TMP11]], i16 [[TMP10]], i32 1 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP12]], <2 x i16> zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> zeroinitializer, <2 x i16> [[TMP12]] ; CHECK-NEXT: [[TMP13]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll index bd9647188911a..1eb6b134621da 100644 --- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll +++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll @@ -23,8 +23,7 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 10) ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[TMP0]], splat (i32 20) ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> splat (i32 9), <2 x i32> splat (i32 9) -; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_IND]], <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_IND]], <2 x i32> splat (i32 9) ; CHECK-NEXT: [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[TMP3]] ; CHECK-NEXT: [[PREDPHI7]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_PHI]], <2 x i32> [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll index d5a206ff21da0..9bc231108fa96 100644 --- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll @@ -21,8 +21,8 @@ define void @single_incoming_phi_no_blend_mask(i64 %a, i64 %b) { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i16> splat (i16 1), <2 x i16> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i16> [[WIDE_LOAD]], <2 x i16> splat (i16 1) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0 ; CHECK-NEXT: store <2 x i16> [[PREDPHI]], ptr [[TMP7]], align 2 @@ -107,9 +107,8 @@ define void @single_incoming_phi_with_blend_mask(i64 %a, i64 %b) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor 
<2 x i1> [[TMP3]], splat (i1 true) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> [[WIDE_LOAD]], <2 x i16> splat (i16 1) -; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP8]], <2 x i16> zeroinitializer, <2 x i16> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[PREDPHI]], <2 x i16> zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 ; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], ptr [[TMP10]], align 2 @@ -189,8 +188,8 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <2 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <2 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[VEC_IND3]], <2 x i16> [[VEC_IND1]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP0]], <2 x i16> [[VEC_IND1]], <2 x i16> [[VEC_IND3]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1 @@ -297,9 +296,8 @@ define void @single_incoming_needs_predication(i64 %a, i64 %b) { ; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP15:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP16]], <2 x i16> [[TMP14]], <2 x i16> splat (i16 1) -; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP17]], <2 x i16> zeroinitializer, <2 x i16> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> [[PREDPHI]], <2 x i16> zeroinitializer ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0 ; CHECK-NEXT: store <2 x i16> [[PREDPHI3]], ptr [[TMP19]], align 2 From bb56e679be84f63f28f5ac0e837e1d0719442dcf Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 2 Jul 2025 00:46:06 +0100 Subject: [PATCH 2/7] Update VPlan debug output tests --- llvm/test/Transforms/LoopVectorize/vplan-printing.ll | 6 +++--- .../LoopVectorize/vplan-sink-scalars-and-merge.ll | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 732214aa1449e..94ee454e971e8 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -187,7 +187,7 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) { ; CHECK-NEXT: Successor(s): if.then.0 ; CHECK-EMPTY: ; CHECK-NEXT: if.then.0: -; CHECK-NEXT: BLEND ir<%d> = ir<0> vp<[[PRED]]>/ir<%cmp> +; CHECK-NEXT: EMIT ir<%d> = select 
ir<%cmp>, vp<[[PRED]]>, ir<0> ; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%d> @@ -376,7 +376,7 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db ; CHECK-NEXT: if.then.0: ; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not ir<%cmp2> ; CHECK-NEXT: EMIT vp<[[SEL2:%.+]]> = logical-and vp<[[NOT1]]>, vp<[[NOT2]]> -; CHECK-NEXT: BLEND ir<%ysd.0> = vp<[[PHI]]> ir<%psd>/vp<[[SEL2]]> +; CHECK-NEXT: EMIT ir<%ysd.0> = select vp<[[SEL2]]>, ir<%psd>, vp<[[PHI]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%isd> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%ysd.0> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> @@ -757,7 +757,7 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) { ; CHECK-EMPTY: ; CHECK-NEXT: if.then.1: ; CHECK-NEXT: WIDEN ir<%fadd> = fadd vp<[[PHI1]]>, vp<[[PHI2]]> -; CHECK-NEXT: BLEND ir<%st.value> = ir<%ld.value> ir<%fadd>/ir<%ifcond> +; CHECK-NEXT: EMIT ir<%st.value> = select ir<%ifcond>, ir<%fadd>, ir<%ld.value> ; CHECK-NEXT: CLONE ir<%st.addr> = getelementptr inbounds ir<%dest>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%st.addr> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%st.value> diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 4a4bda254bf88..a330afc45abab 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -366,7 +366,7 @@ define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): then.0.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.0: -; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]> +; CHECK-NEXT: EMIT ir<%p> = select vp<[[MASK2]]>, vp<[[PRED]]>, ir<0> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -465,7 +465,7 @@ define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): then.0.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.0: -; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]> +; CHECK-NEXT: EMIT ir<%p> = select vp<[[MASK2]]>, vp<[[PRED]]>, ir<0> ; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.1> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: @@ -571,7 +571,7 @@ define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): then.0.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.0: -; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]> +; CHECK-NEXT: EMIT ir<%p> = select vp<[[MASK2]]>, vp<[[PRED]]>, ir<0> ; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.0> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: From e0a2fbd634e7fbaa785d5a6ac7d44a344908ce2b Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 4 Jul 2025 12:59:46 +0100 Subject: [PATCH 3/7] Mark in planContainsAdditionalSimplifications This fixes a crash on SPEC CPU 2017 with mismatching legacy + vplan cost models. 
I'm really struggling to create a test case for this due to the issue at https://github.com/llvm/llvm-project/issues/147038 --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 67df7a8af098d..ee6728d846203 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7048,6 +7048,13 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, if (isa(&R)) return true; + // VPBlendRecipes are converted to selects and may have been simplified. + using namespace VPlanPatternMatch; + if (match(&R, m_VPInstruction( + m_VPValue(), m_VPValue(), m_VPValue())) && + isa_and_nonnull(R.getVPSingleValue()->getUnderlyingValue())) + return true; + /// If a VPlan transform folded a recipe to one producing a single-scalar, /// but the original instruction wasn't uniform-after-vectorization in the /// legacy cost model, the legacy cost overestimates the actual cost. From 4911b73a3262ba31b9c868c53ce021b4af3e936f Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 8 Jul 2025 17:56:49 +0800 Subject: [PATCH 4/7] Use getUnderlyingInstr --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index fcf22b0266349..7c107467743ea 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7051,7 +7051,7 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, using namespace VPlanPatternMatch; if (match(&R, m_VPInstruction( m_VPValue(), m_VPValue(), m_VPValue())) && - isa_and_nonnull(R.getVPSingleValue()->getUnderlyingValue())) + isa_and_nonnull(cast(R).getUnderlyingInstr())) return true; /// If a VPlan transform folded a recipe to one producing a single-scalar, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index d3f4810d6efec..53bda7b2419d3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -929,7 +929,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, Ctx.CostKind); } case Instruction::Select: { - if (!getUnderlyingValue()) + if (!getUnderlyingInstr()) return 0; // Handle cases where only the first lane is used the same way as the legacy // cost model. From 78c3a086c64c974519847a53ba83a27eb253360d Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 8 Jul 2025 18:03:53 +0800 Subject: [PATCH 5/7] Revert "Use getUnderlyingInstr" This reverts commit 4911b73a3262ba31b9c868c53ce021b4af3e936f. 
Turns out we can't use getUnderlyingInstr since it does a cast<> which asserts on null --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7c107467743ea..fcf22b0266349 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7051,7 +7051,7 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, using namespace VPlanPatternMatch; if (match(&R, m_VPInstruction( m_VPValue(), m_VPValue(), m_VPValue())) && - isa_and_nonnull(cast(R).getUnderlyingInstr())) + isa_and_nonnull(R.getVPSingleValue()->getUnderlyingValue())) return true; /// If a VPlan transform folded a recipe to one producing a single-scalar, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 53bda7b2419d3..d3f4810d6efec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -929,7 +929,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, Ctx.CostKind); } case Instruction::Select: { - if (!getUnderlyingInstr()) + if (!getUnderlyingValue()) return 0; // Handle cases where only the first lane is used the same way as the legacy // cost model. From a3c03405cee01f3214fb69f97c9ddc4e634d1bd2 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 9 Jul 2025 13:41:58 +0800 Subject: [PATCH 6/7] Precisely track blend selects to see if they have been simplified, add test case --- .../Transforms/Vectorize/LoopVectorize.cpp | 15 +++- .../LoopVectorize/RISCV/blend-simplified.ll | 81 +++++++++++++++++++ 2 files changed, 93 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/blend-simplified.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index fcf22b0266349..55290edac90b1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7020,6 +7020,7 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, }; DenseSet SeenInstrs; + SmallDenseMap BlendPhis; auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &R : *VPBB) { @@ -7048,11 +7049,13 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, return true; // VPBlendRecipes are converted to selects and may have been simplified. + // Keep track of how many selects each phi has been converted to. using namespace VPlanPatternMatch; if (match(&R, m_VPInstruction( - m_VPValue(), m_VPValue(), m_VPValue())) && - isa_and_nonnull(R.getVPSingleValue()->getUnderlyingValue())) - return true; + m_VPValue(), m_VPValue(), m_VPValue()))) + if (auto *Phi = dyn_cast_if_present( + R.getVPSingleValue()->getUnderlyingValue())) + BlendPhis[Phi]++; /// If a VPlan transform folded a recipe to one producing a single-scalar, /// but the original instruction wasn't uniform-after-vectorization in the @@ -7077,6 +7080,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, } } + // If a phi has been simplified then it will have less selects than the number + // of incoming values. 
+ for (auto [Phi, NumSelects] : BlendPhis) + if (NumSelects != Phi->getNumIncomingValues() - 1) + return true; + // Return true if the loop contains any instructions that are not also part of // the VPlan or are skipped for VPlan-based cost computations. This indicates // that the VPlan contains extra simplifications. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-simplified.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-simplified.ll new file mode 100644 index 0000000000000..2ade5502d0019 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/blend-simplified.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt < %s -S -p loop-vectorize | FileCheck %s + +; VPlanTransforms::simplifyRecipes will simplify some of selects stemming from +; the blend recipe, which will cause a difference between the legacy and VPlan +; based cost models. Make sure we account for this simplifcation. This is +; extracted from sqlite3 in llvm-test-suite. + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +define i64 @html_encode() #0 { +; CHECK-LABEL: define i64 @html_encode( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[BROADCAST_SPLAT]], splat (i8 38) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i64> zeroinitializer, <2 x i64> splat (i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i64> zeroinitializer, [[PREDPHI]] +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: br i1 true, label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[SW_BB6:.*]]: +; CHECK-NEXT: br label %[[FOR_INC:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[J_031:%.*]] = phi i64 [ [[INC12:%.*]], %[[FOR_INC]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: switch i8 [[TMP4]], label %[[FOR_INC]] [ +; CHECK-NEXT: i8 62, label %[[FOR_INC]] +; CHECK-NEXT: i8 1, label %[[FOR_INC]] +; CHECK-NEXT: i8 38, label %[[SW_BB6]] +; CHECK-NEXT: i8 0, label %[[FOR_INC]] +; CHECK-NEXT: ] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[DOTSINK:%.*]] = phi i64 [ 0, %[[SW_BB6]] ], [ 1, %[[FOR_BODY]] ], [ 1, %[[FOR_BODY]] ], [ 1, %[[FOR_BODY]] ], [ 1, %[[FOR_BODY]] ] +; CHECK-NEXT: [[INC:%.*]] = or i64 0, [[DOTSINK]] +; CHECK-NEXT: [[INC12]] = add i64 [[J_031]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[J_031]], 1 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i64 [ [[INC]], %[[FOR_INC]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[INC_LCSSA]] +; +entry: + br 
label %for.body + +sw.bb6: ; preds = %for.body + br label %for.inc + +for.body: ; preds = %for.inc, %entry + %j.031 = phi i64 [ %inc12, %for.inc ], [ 0, %entry ] + %0 = load i8, ptr null, align 1 + switch i8 %0, label %for.inc [ + i8 62, label %for.inc + i8 1, label %for.inc + i8 38, label %sw.bb6 + i8 0, label %for.inc + ] + +for.inc: ; preds = %for.body, %for.body, %for.body, %for.body, %sw.bb6 + %.sink = phi i64 [ 0, %sw.bb6 ], [ 1, %for.body ], [ 1, %for.body ], [ 1, %for.body ], [ 1, %for.body ] + %inc = or i64 0, %.sink + %inc12 = add i64 %j.031, 1 + %exitcond.not = icmp eq i64 %j.031, 1 + br i1 %exitcond.not, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.inc + ret i64 %inc +} + +attributes #0 = { "target-features"="+v" } From b39d6f55e7d6f608190a8377375cb0e4c2c0c211 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 15 Jul 2025 15:38:33 +0800 Subject: [PATCH 7/7] Don't simplify, just expand in convertToConcreteRecipes --- .../Transforms/Vectorize/LoopVectorize.cpp | 16 ---- llvm/lib/Transforms/Vectorize/VPlan.h | 4 + .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 27 ++++--- .../Transforms/Vectorize/VPlanTransforms.cpp | 50 ++++++++++-- .../AArch64/masked-call-scalarize.ll | 4 +- .../LoopVectorize/AArch64/masked-call.ll | 4 +- .../LoopVectorize/RISCV/blend-simplified.ll | 81 ------------------- .../RISCV/blocks-with-dead-instructions.ll | 4 +- .../Transforms/LoopVectorize/RISCV/divrem.ll | 24 +++--- ...rize-force-tail-with-evl-cond-reduction.ll | 8 +- .../LoopVectorize/X86/load-deref-pred.ll | 16 ++-- .../X86/replicate-uniform-call.ll | 55 ++++++++++++- .../Transforms/LoopVectorize/if-conversion.ll | 4 +- .../LoopVectorize/load-deref-pred-align.ll | 4 +- .../pr55167-fold-tail-live-out.ll | 3 +- .../LoopVectorize/single-value-blend-phis.ll | 14 ++-- .../LoopVectorize/vplan-printing.ll | 6 +- .../vplan-sink-scalars-and-merge.ll | 6 +- 18 files changed, 163 insertions(+), 167 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/blend-simplified.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 934b1db32e585..5380a0fc6498a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6909,7 +6909,6 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, }; DenseSet SeenInstrs; - SmallDenseMap BlendPhis; auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &R : *VPBB) { @@ -6937,15 +6936,6 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, if (isa(&R)) return true; - // VPBlendRecipes are converted to selects and may have been simplified. - // Keep track of how many selects each phi has been converted to. - using namespace VPlanPatternMatch; - if (match(&R, m_VPInstruction( - m_VPValue(), m_VPValue(), m_VPValue()))) - if (auto *Phi = dyn_cast_if_present( - R.getVPSingleValue()->getUnderlyingValue())) - BlendPhis[Phi]++; - /// If a VPlan transform folded a recipe to one producing a single-scalar, /// but the original instruction wasn't uniform-after-vectorization in the /// legacy cost model, the legacy cost overestimates the actual cost. @@ -6969,12 +6959,6 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, } } - // If a phi has been simplified then it will have less selects than the number - // of incoming values. 
- for (auto [Phi, NumSelects] : BlendPhis) - if (NumSelects != Phi->getNumIncomingValues() - 1) - return true; - // Return true if the loop contains any instructions that are not also part of // the VPlan or are skipped for VPlan-based cost computations. This indicates // that the VPlan contains extra simplifications. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 12f7142ebad7f..85741b977bb77 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2306,6 +2306,10 @@ class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe { /// Generate the phi/select nodes. void execute(VPTransformState &State) override; + /// Return the cost of this VPWidenMemoryRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index a05b0f2f15e16..86f5bc6c33d34 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -924,19 +924,6 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ctx.CostKind); } - case Instruction::Select: { - if (!getUnderlyingValue()) - return 0; - // Handle cases where only the first lane is used the same way as the legacy - // cost model. - if (vputils::onlyFirstLaneUsed(this)) - return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); - Type *ResTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); - Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); - return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResTy, CmpTy, - CmpInst::BAD_ICMP_PREDICATE, - Ctx.CostKind); - } case VPInstruction::AnyOf: { auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); return Ctx.TTI.getArithmeticReductionCost( @@ -2414,6 +2401,20 @@ void VPBlendRecipe::execute(VPTransformState &State) { llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends"); } +InstructionCost VPBlendRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + // Handle cases where only the first lane is used the same way as the legacy + // cost model. 
+ if (vputils::onlyFirstLaneUsed(this)) + return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); + + Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); + return (getNumIncomingValues() - 1) * + Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy, + CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e62e1c9968e02..bf5a6750f0fd6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1297,17 +1297,38 @@ static void simplifyBlends(VPlan &Plan) { } } - VPBuilder Builder(&R); - VPValue *Select = Blend->getIncomingValue(StartIndex); + SmallVector OperandsWithMask; + OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { if (I == StartIndex) continue; - Select = - Builder.createSelect(Blend->getMask(I), Blend->getIncomingValue(I), - Select, R.getDebugLoc(), "predphi"); - Select->setUnderlyingValue(Blend->getUnderlyingValue()); + OperandsWithMask.push_back(Blend->getIncomingValue(I)); + OperandsWithMask.push_back(Blend->getMask(I)); + } + + auto *NewBlend = new VPBlendRecipe( + cast(Blend->getUnderlyingValue()), OperandsWithMask); + NewBlend->insertBefore(&R); + + VPValue *DeadMask = Blend->getMask(StartIndex); + Blend->replaceAllUsesWith(NewBlend); + Blend->eraseFromParent(); + recursivelyDeleteDeadRecipes(DeadMask); + + /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask. + VPValue *NewMask; + if (NewBlend->getNumOperands() == 3 && + match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) { + VPValue *Inc0 = NewBlend->getOperand(0); + VPValue *Inc1 = NewBlend->getOperand(1); + VPValue *OldMask = NewBlend->getOperand(2); + NewBlend->setOperand(0, Inc1); + NewBlend->setOperand(1, Inc0); + NewBlend->setOperand(2, NewMask); + if (OldMask->getNumUsers() == 0) + cast(OldMask)->eraseFromParent(); } - Blend->replaceAllUsesWith(Select); } } } @@ -2690,6 +2711,20 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, continue; } + // Expand VPBlendRecipe into VPInstruction::Select + VPBuilder Builder(&R); + if (auto *Blend = dyn_cast(&R)) { + VPValue *Select = Blend->getIncomingValue(0); + for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) { + Select = Builder.createSelect(Blend->getMask(I), + Blend->getIncomingValue(I), Select, + R.getDebugLoc(), "predphi"); + Select->setUnderlyingValue(Blend->getUnderlyingValue()); + } + Blend->replaceAllUsesWith(Select); + ToRemove.push_back(Blend); + } + if (auto *Expr = dyn_cast(&R)) { Expr->decompose(); ToRemove.push_back(Expr); @@ -2703,7 +2738,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, // Expand WideIVStep. 
       auto *VPI = cast<VPInstruction>(&R);
-      VPBuilder Builder(VPI);
       Type *IVTy = TypeInfo.inferScalarType(VPI);
       if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
         Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
index c507c36f0dba9..078f98f54525b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
@@ -22,8 +22,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFNONE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
 ; TFNONE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
 ; TFNONE-NEXT:    [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
-; TFNONE-NEXT:    [[TMP3:%.*]] = fcmp ule <2 x double> [[TMP2]], zeroinitializer
-; TFNONE-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
+; TFNONE-NEXT:    [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
+; TFNONE-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
 ; TFNONE-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
 ; TFNONE-NEXT:    store double [[TMP14]], ptr [[P:%.*]], align 8
 ; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index b9b54aeca7f00..6029095bbe7b1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -935,8 +935,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFNONE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[TMP7]], i64 0
 ; TFNONE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
 ; TFNONE-NEXT:    [[TMP8:%.*]] = call @exp_masked_scalable( [[BROADCAST_SPLAT]], splat (i1 true))
-; TFNONE-NEXT:    [[TMP9:%.*]] = fcmp ule [[TMP8]], zeroinitializer
-; TFNONE-NEXT:    [[PREDPHI:%.*]] = select [[TMP9]], splat (double 1.000000e+00), zeroinitializer
+; TFNONE-NEXT:    [[TMP9:%.*]] = fcmp ogt [[TMP8]], zeroinitializer
+; TFNONE-NEXT:    [[PREDPHI:%.*]] = select [[TMP9]], zeroinitializer, splat (double 1.000000e+00)
 ; TFNONE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
 ; TFNONE-NEXT:    [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 2
 ; TFNONE-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-simplified.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-simplified.ll
deleted file mode 100644
index 2ade5502d0019..0000000000000
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blend-simplified.ll
+++ /dev/null
@@ -1,81 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
-; RUN: opt < %s -S -p loop-vectorize | FileCheck %s
-
-; VPlanTransforms::simplifyRecipes will simplify some of selects stemming from
-; the blend recipe, which will cause a difference between the legacy and VPlan
-; based cost models. Make sure we account for this simplifcation. This is
-; extracted from sqlite3 in llvm-test-suite.
- -target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" -target triple = "riscv64-unknown-linux-gnu" - -define i64 @html_encode() #0 { -; CHECK-LABEL: define i64 @html_encode( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[TMP0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[BROADCAST_SPLAT]], splat (i8 38) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i64> zeroinitializer, <2 x i64> splat (i64 1) -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i64> zeroinitializer, [[PREDPHI]] -; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: br i1 true, label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[SW_BB6:.*]]: -; CHECK-NEXT: br label %[[FOR_INC:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[J_031:%.*]] = phi i64 [ [[INC12:%.*]], %[[FOR_INC]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: switch i8 [[TMP4]], label %[[FOR_INC]] [ -; CHECK-NEXT: i8 62, label %[[FOR_INC]] -; CHECK-NEXT: i8 1, label %[[FOR_INC]] -; CHECK-NEXT: i8 38, label %[[SW_BB6]] -; CHECK-NEXT: i8 0, label %[[FOR_INC]] -; CHECK-NEXT: ] -; CHECK: [[FOR_INC]]: -; CHECK-NEXT: [[DOTSINK:%.*]] = phi i64 [ 0, %[[SW_BB6]] ], [ 1, %[[FOR_BODY]] ], [ 1, %[[FOR_BODY]] ], [ 1, %[[FOR_BODY]] ], [ 1, %[[FOR_BODY]] ] -; CHECK-NEXT: [[INC:%.*]] = or i64 0, [[DOTSINK]] -; CHECK-NEXT: [[INC12]] = add i64 [[J_031]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[J_031]], 1 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[FOR_END_LOOPEXIT]]: -; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i64 [ [[INC]], %[[FOR_INC]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[INC_LCSSA]] -; -entry: - br label %for.body - -sw.bb6: ; preds = %for.body - br label %for.inc - -for.body: ; preds = %for.inc, %entry - %j.031 = phi i64 [ %inc12, %for.inc ], [ 0, %entry ] - %0 = load i8, ptr null, align 1 - switch i8 %0, label %for.inc [ - i8 62, label %for.inc - i8 1, label %for.inc - i8 38, label %sw.bb6 - i8 0, label %for.inc - ] - -for.inc: ; preds = %for.body, %for.body, %for.body, %for.body, %sw.bb6 - %.sink = phi i64 [ 0, %sw.bb6 ], [ 1, %for.body ], [ 1, %for.body ], [ 1, %for.body ], [ 1, %for.body ] - %inc = or i64 0, %.sink - %inc12 = add i64 %j.031, 1 - %exitcond.not = icmp eq i64 %j.031, 1 - br i1 %exitcond.not, label %for.end.loopexit, label %for.body - -for.end.loopexit: ; preds = %for.inc - ret i64 %inc -} - -attributes #0 = { "target-features"="+v" } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll index d5e07a615057d..f8b83ff41f512 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll @@ -597,8 +597,8 @@ define void @empty_block_with_phi_1(ptr %src, i64 %N) #0 { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], [[WIDE_LOAD]], splat (i16 99) +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], splat (i16 99), [[WIDE_LOAD]] ; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP11]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], [[TMP5]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index 2a492d257b7b8..c6661cb644d80 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -677,9 +677,9 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq [[WIDE_LOAD]], splat (i64 42) +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[WIDE_LOAD]], splat (i64 42) ; CHECK-NEXT: [[TMP10:%.*]] = udiv [[WIDE_LOAD]], splat (i64 27) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[WIDE_LOAD]], [[TMP10]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP10]], [[WIDE_LOAD]] ; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -720,12 +720,12 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 ; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42) -; FIXED-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD1]], splat (i64 42) +; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], splat (i64 42) +; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], splat (i64 42) ; FIXED-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], splat (i64 27) ; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], splat (i64 27) -; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[WIDE_LOAD]], <4 x i64> [[TMP6]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[WIDE_LOAD1]], <4 x i64> [[TMP7]] +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD1]] ; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8 ; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP3]], align 8 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -797,9 +797,9 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] ; 
CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq [[WIDE_LOAD]], splat (i64 42) +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[WIDE_LOAD]], splat (i64 42) ; CHECK-NEXT: [[TMP10:%.*]] = sdiv [[WIDE_LOAD]], splat (i64 27) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[WIDE_LOAD]], [[TMP10]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP10]], [[WIDE_LOAD]] ; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -840,12 +840,12 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 ; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42) -; FIXED-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD1]], splat (i64 42) +; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], splat (i64 42) +; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], splat (i64 42) ; FIXED-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], splat (i64 27) ; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], splat (i64 27) -; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[WIDE_LOAD]], <4 x i64> [[TMP6]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[WIDE_LOAD1]], <4 x i64> [[TMP7]] +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD1]] ; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8 ; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP3]], align 8 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 912b814bdd114..2926011857ae9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -398,9 +398,9 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 ; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load , ptr [[TMP21]], align 4 -; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = icmp sle [[WIDE_MASKED_LOAD]], splat (i32 3) +; NO-VP-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt [[WIDE_MASKED_LOAD]], splat (i32 3) ; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] -; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP12]], [[VEC_PHI]], [[TMP16]] +; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP18]], [[TMP16]], [[VEC_PHI]] ; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] ; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-OUTLOOP-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -956,9 +956,9 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, 
i32 %start) { ; NO-VP-OUTLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; NO-VP-OUTLOOP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0 ; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load , ptr [[TMP27]], align 4 -; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp sle [[WIDE_MASKED_LOAD]], [[VEC_IND]] +; NO-VP-OUTLOOP-NEXT: [[TMP28:%.*]] = icmp sgt [[WIDE_MASKED_LOAD]], [[VEC_IND]] ; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] -; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP13]], [[VEC_PHI]], [[TMP22]] +; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP28]], [[TMP22]], [[VEC_PHI]] ; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] ; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; NO-VP-OUTLOOP-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll index 7e66fcf8568d3..f4cd48de60243 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -34,10 +34,10 @@ define i32 @test_explicit_pred(i64 %len) { ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) ; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], splat (i64 4) -; CHECK-NEXT: [[TMP0:%.*]] = icmp sge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP1:%.*]] = icmp sge <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sge <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4 @@ -47,10 +47,10 @@ define i32 @test_explicit_pred(i64 %len) { ; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD]] -; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD7]] -; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD8]] -; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD9]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[WIDE_LOAD8]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[WIDE_LOAD9]], <4 x i32> zeroinitializer ; 
CHECK-NEXT: [[TMP16]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[TMP17]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]] ; CHECK-NEXT: [[TMP18]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll index 5ad5cf14aea10..ed1c67c082134 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll @@ -11,9 +11,54 @@ define void @smax_call_uniform(ptr %dst, i64 %x) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[C:%.*]] = icmp ult i8 -68, -69 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[X]], 0 +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_UREM_IF:.*]], label %[[PRED_UREM_CONTINUE:.*]] +; CHECK: [[PRED_UREM_IF]]: +; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE]] +; CHECK: [[PRED_UREM_CONTINUE]]: +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]] +; CHECK: [[PRED_UREM_IF1]]: +; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE2]] +; CHECK: [[PRED_UREM_CONTINUE2]]: +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_UREM_IF3:.*]], label %[[PRED_UREM_CONTINUE4:.*]] +; CHECK: [[PRED_UREM_IF3]]: +; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE4]] +; CHECK: [[PRED_UREM_CONTINUE4]]: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UREM_IF5:.*]], label %[[PRED_UREM_CONTINUE6]] +; CHECK: [[PRED_UREM_IF5]]: +; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE6]] +; CHECK: [[PRED_UREM_CONTINUE6]]: +; CHECK-NEXT: [[TMP12:%.*]] = tail call i64 @llvm.smax.i64(i64 0, i64 0) +; CHECK-NEXT: [[TMP13:%.*]] = tail call i64 @llvm.smax.i64(i64 0, i64 0) +; CHECK-NEXT: [[P:%.*]] = select i1 [[C]], i64 1, i64 [[TMP12]] +; CHECK-NEXT: [[PREDPHI7:%.*]] = select i1 [[C]], i64 1, i64 [[TMP13]] +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[P]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[PREDPHI7]], 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[DST]], i64 [[ADD]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP17]] +; CHECK-NEXT: store i64 0, ptr [[GEP]], align 8 +; CHECK-NEXT: store i64 0, ptr [[TMP19]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; 
CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[ELSE:.*]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: [[REM1:%.*]] = urem i64 [[MUL]], [[X]] @@ -26,7 +71,7 @@ define void @smax_call_uniform(ptr %dst, i64 %x) { ; CHECK-NEXT: store i64 0, ptr [[GEP1]], align 8 ; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT1]], 1024 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -58,3 +103,9 @@ exit: } declare i64 @llvm.smax.i64(i64, i64) +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion.ll b/llvm/test/Transforms/LoopVectorize/if-conversion.ll index 57109a0996a66..8a7f4a386fda1 100644 --- a/llvm/test/Transforms/LoopVectorize/if-conversion.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion.ll @@ -159,10 +159,10 @@ define i32 @reduction_func(ptr nocapture %A, i32 %n) nounwind uwtable readonly s ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], splat (i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 30) ; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[VEC_PHI]], splat (i32 2) ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[WIDE_LOAD]] -; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP4]] +; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 7c6bc6aa6ca3a..8a326c9d0c083 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -567,7 +567,7 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <2 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]] @@ 
-577,7 +577,7 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 2 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> poison, i16 [[TMP9]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> [[TMP11]], i16 [[TMP10]], i32 1 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> zeroinitializer, <2 x i16> [[TMP12]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP12]], <2 x i16> zeroinitializer ; CHECK-NEXT: [[TMP13]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll index 1eb6b134621da..bd9647188911a 100644 --- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll +++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll @@ -23,7 +23,8 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 10) ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[TMP0]], splat (i32 20) ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_IND]], <2 x i32> splat (i32 9) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> splat (i32 9), <2 x i32> splat (i32 9) +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_IND]], <2 x i32> [[PREDPHI]] ; CHECK-NEXT: [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[TMP3]] ; CHECK-NEXT: [[PREDPHI7]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_PHI]], <2 x i32> [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll index 9bc231108fa96..d5a206ff21da0 100644 --- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll @@ -21,8 +21,8 @@ define void @single_incoming_phi_no_blend_mask(i64 %a, i64 %b) { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i16> [[WIDE_LOAD]], <2 x i16> splat (i16 1) +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i16> splat (i16 1), <2 x i16> [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0 ; CHECK-NEXT: store <2 x i16> [[PREDPHI]], ptr [[TMP7]], align 2 @@ -107,8 +107,9 @@ define void @single_incoming_phi_with_blend_mask(i64 %a, i64 %b) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor 
<2 x i1> [[TMP3]], splat (i1 true) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> [[WIDE_LOAD]], <2 x i16> splat (i16 1) -; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[PREDPHI]], <2 x i16> zeroinitializer +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP8]], <2 x i16> zeroinitializer, <2 x i16> [[PREDPHI]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 ; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], ptr [[TMP10]], align 2 @@ -188,8 +189,8 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <2 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <2 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP0]], <2 x i16> [[VEC_IND1]], <2 x i16> [[VEC_IND3]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[VEC_IND3]], <2 x i16> [[VEC_IND1]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1 @@ -296,8 +297,9 @@ define void @single_incoming_needs_predication(i64 %a, i64 %b) { ; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP15:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP16]], <2 x i16> [[TMP14]], <2 x i16> splat (i16 1) -; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> [[PREDPHI]], <2 x i16> zeroinitializer +; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP17]], <2 x i16> zeroinitializer, <2 x i16> [[PREDPHI]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0 ; CHECK-NEXT: store <2 x i16> [[PREDPHI3]], ptr [[TMP19]], align 2 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 94ee454e971e8..732214aa1449e 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -187,7 +187,7 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) { ; CHECK-NEXT: Successor(s): if.then.0 ; CHECK-EMPTY: ; CHECK-NEXT: if.then.0: -; CHECK-NEXT: EMIT ir<%d> = select ir<%cmp>, vp<[[PRED]]>, ir<0> +; CHECK-NEXT: BLEND ir<%d> = ir<0> vp<[[PRED]]>/ir<%cmp> ; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%d> @@ -376,7 +376,7 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db ; CHECK-NEXT: if.then.0: ; CHECK-NEXT: EMIT 
vp<[[NOT2:%.+]]> = not ir<%cmp2> ; CHECK-NEXT: EMIT vp<[[SEL2:%.+]]> = logical-and vp<[[NOT1]]>, vp<[[NOT2]]> -; CHECK-NEXT: EMIT ir<%ysd.0> = select vp<[[SEL2]]>, ir<%psd>, vp<[[PHI]]> +; CHECK-NEXT: BLEND ir<%ysd.0> = vp<[[PHI]]> ir<%psd>/vp<[[SEL2]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%isd> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%ysd.0> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> @@ -757,7 +757,7 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) { ; CHECK-EMPTY: ; CHECK-NEXT: if.then.1: ; CHECK-NEXT: WIDEN ir<%fadd> = fadd vp<[[PHI1]]>, vp<[[PHI2]]> -; CHECK-NEXT: EMIT ir<%st.value> = select ir<%ifcond>, ir<%fadd>, ir<%ld.value> +; CHECK-NEXT: BLEND ir<%st.value> = ir<%ld.value> ir<%fadd>/ir<%ifcond> ; CHECK-NEXT: CLONE ir<%st.addr> = getelementptr inbounds ir<%dest>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%st.addr> ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%st.value> diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index a330afc45abab..4a4bda254bf88 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -366,7 +366,7 @@ define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): then.0.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.0: -; CHECK-NEXT: EMIT ir<%p> = select vp<[[MASK2]]>, vp<[[PRED]]>, ir<0> +; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -465,7 +465,7 @@ define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): then.0.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.0: -; CHECK-NEXT: EMIT ir<%p> = select vp<[[MASK2]]>, vp<[[PRED]]>, ir<0> +; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]> ; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.1> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: @@ -571,7 +571,7 @@ define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): then.0.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.0: -; CHECK-NEXT: EMIT ir<%p> = select vp<[[MASK2]]>, vp<[[PRED]]>, ir<0> +; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]> ; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.0> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: