diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 63fccee63c0ae..1dff9c3513a28 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -163,6 +163,7 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
   case Intrinsic::is_fpclass:
   case Intrinsic::vp_is_fpclass:
   case Intrinsic::powi:
+  case Intrinsic::vector_extract:
     return (ScalarOpdIdx == 1);
   case Intrinsic::smul_fix:
   case Intrinsic::smul_fix_sat:
@@ -195,6 +196,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   case Intrinsic::vp_llrint:
   case Intrinsic::ucmp:
   case Intrinsic::scmp:
+  case Intrinsic::vector_extract:
     return OpdIdx == -1 || OpdIdx == 0;
   case Intrinsic::modf:
   case Intrinsic::sincos:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 144f35e10132f..dd54d964f8883 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -44,6 +44,7 @@ class VPRecipeBuilder;
 struct VFRange;
 
 extern cl::opt<bool> EnableVPlanNativePath;
+extern cl::opt<bool> EnableWideActiveLaneMask;
 extern cl::opt<unsigned> ForceTargetInstructionCost;
 
 /// VPlan-based builder utility analogous to IRBuilder.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e7bae17dd2ceb..6e5f4caf93d23 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -356,6 +356,10 @@ cl::opt<bool> llvm::EnableVPlanNativePath(
     cl::desc("Enable VPlan-native vectorization path with "
              "support for outer loop vectorization."));
 
+cl::opt<bool> llvm::EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide get active lane mask instructions"));
+
 cl::opt<bool>
     llvm::VerifyEachVPlan("vplan-verify-each",
 #ifdef EXPENSIVE_CHECKS
@@ -7328,7 +7332,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
                              BestVPlan, BestVF, VScale);
   }
-  VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+  VPlanTransforms::optimizeForVFAndUF(
+      BestVPlan, BestVF, BestUF, PSE,
+      ILV.Cost->getTailFoldingStyle() ==
+          TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck);
   VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
   VPlanTransforms::narrowInterleaveGroups(
       BestVPlan, BestVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 356af4a0e74e4..6080aa88ec306 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -954,6 +954,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // part if it is scalar. In the latter case, the recipe will be removed
     // during unrolling.
     ExtractPenultimateElement,
+    // Extracts a subvector from a vector (first operand) starting at a given
+    // offset (second operand).
+    ExtractSubvector,
     LogicalAnd, // Non-poison propagating logical And.
     // Add an offset in bytes (second operand) to a base pointer (first
     // operand). Only generates scalar values (either for the first lane only or
@@ -1887,6 +1890,9 @@ class VPHeaderPHIRecipe : public VPSingleDefRecipe, public VPPhiAccessors {
     return getOperand(1);
   }
 
+  // Update the incoming value from the loop backedge.
+  void setBackedgeValue(VPValue *V) { setOperand(1, V); }
+
   /// Returns the backedge value as a recipe. The backedge value is guaranteed
   /// to be a recipe.
   virtual VPRecipeBase &getBackedgeRecipe() {
@@ -3234,10 +3240,12 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
 /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
 /// remove VPActiveLaneMaskPHIRecipe.
 class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
+  unsigned UnrollPart = 0;
+
 public:
-  VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
-      : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask,
-                          DL) {}
+  VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL, unsigned Part = 0)
+      : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask, DL),
+        UnrollPart(Part) {}
 
   ~VPActiveLaneMaskPHIRecipe() override = default;
 
@@ -3250,6 +3258,9 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
 
   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
 
+  unsigned getUnrollPart() { return UnrollPart; }
+  void setUnrollPart(unsigned Part) { UnrollPart = Part; }
+
   /// Generate the active lane mask phi of the vector loop.
   void execute(VPTransformState &State) override;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 92db9674ef42b..5e7f797b70978 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -74,6 +74,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   switch (Opcode) {
   case Instruction::ExtractElement:
   case Instruction::Freeze:
+  case VPInstruction::ExtractSubvector:
   case VPInstruction::ReductionStartVector:
     return inferScalarType(R->getOperand(0));
   case Instruction::Select: {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index efea99f22d086..62898bf2c1991 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -384,10 +384,11 @@ m_Broadcast(const Op0_t &Op0) {
   return m_VPInstruction<VPInstruction::Broadcast>(Op0);
 }
 
-template <typename Op0_t, typename Op1_t>
-inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask>
-m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
-  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t,
+                                  VPInstruction::ActiveLaneMask>
+m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
 }
 
 template <typename Op0_t, typename Op1_t>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ccb7512051d77..c776d5cb91278 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -469,15 +469,16 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case Instruction::ICmp:
   case Instruction::FCmp:
   case Instruction::Store:
-  case VPInstruction::ActiveLaneMask:
   case VPInstruction::BranchOnCount:
   case VPInstruction::ComputeReductionResult:
+  case VPInstruction::ExtractSubvector:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
   case VPInstruction::PtrAdd:
   case VPInstruction::WideIVStep:
     return 2;
   case Instruction::Select:
+  case VPInstruction::ActiveLaneMask:
   case VPInstruction::ComputeAnyOfResult:
   case VPInstruction::ReductionStartVector:
     return 3;
@@ -614,7 +615,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                Name);
 
     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
-    auto *PredTy = VectorType::get(Int1Ty, State.VF);
+    auto PredTy = VectorType::get(
+        Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+                               ->getZExtValue());
     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
@@ -846,6 +849,14 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Res->setName(Name);
     return Res;
   }
+  case VPInstruction::ExtractSubvector: {
+    Value *Vec = State.get(getOperand(0));
+    assert(State.VF.isVector());
+    auto Idx = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
+    auto ResTy = VectorType::get(
+        State.TypeAnalysis.inferScalarType(getOperand(0)), State.VF);
+    return Builder.CreateExtractVector(ResTy, Vec, Idx);
+  }
   case VPInstruction::LogicalAnd: {
     Value *A = State.get(getOperand(0));
     Value *B = State.get(getOperand(1));
@@ -1044,6 +1055,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::ExtractLastElement:
   case VPInstruction::ExtractPenultimateElement:
+  case VPInstruction::ExtractSubvector:
   case VPInstruction::FirstActiveLane:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
@@ -1174,6 +1186,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ExtractPenultimateElement:
     O << "extract-penultimate-element";
     break;
+  case VPInstruction::ExtractSubvector:
+    O << "extract-subvector";
+    break;
   case VPInstruction::ComputeAnyOfResult:
     O << "compute-anyof-result";
     break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 90137b72c83fb..b8f14ca88e8a3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlanTransforms.h"
+#include "LoopVectorizationPlanner.h"
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
@@ -1432,20 +1433,93 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
   return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C);
 }
 
+static void extractFromWideActiveLaneMask(VPlan &Plan, ElementCount VF,
+                                          unsigned UF) {
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
+  auto *Term = &ExitingVPBB->back();
+
+  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+  LLVMContext &Ctx = CanonicalIV->getScalarType()->getContext();
+  using namespace llvm::VPlanPatternMatch;
+
+  auto extractFromALM = [&](VPInstruction *ALM, VPInstruction *InsBefore,
+                            SmallVectorImpl<VPValue *> &Extracts) {
+    VPBuilder Builder(InsBefore);
+    DebugLoc DL = ALM->getDebugLoc();
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<VPValue *> Ops;
+      Ops.append({ALM, Plan.getOrAddLiveIn(
+                           ConstantInt::get(IntegerType::getInt64Ty(Ctx),
+                                            VF.getKnownMinValue() * Part))});
+      Extracts.push_back(
+          Builder.createNaryOp(VPInstruction::ExtractSubvector, Ops, DL));
+    }
+  };
+
+  // Create a list of each active lane mask phi, ordered by unroll part.
+  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
+  for (VPRecipeBase &R : Header->phis())
+    if (auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R))
+      Phis[Phi->getUnrollPart()] = Phi;
+
+  assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
+         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
+
+  // When using wide lane masks, the return type of the get.active.lane.mask
+  // intrinsic is VF x UF (second operand).
+  VPValue *ALMMultiplier =
+      Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+  cast<VPInstruction>(Phis[0]->getStartValue())->setOperand(2, ALMMultiplier);
+  cast<VPInstruction>(Phis[0]->getBackedgeValue())
+      ->setOperand(2, ALMMultiplier);
+
+  // Create UF x extract vectors and insert into preheader.
+  SmallVector<VPValue *> EntryExtracts;
+  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
+  extractFromALM(EntryALM, cast<VPInstruction>(&EntryALM->getParent()->back()),
+                 EntryExtracts);
+
+  // Create UF x extract vectors and insert before the loop compare & branch,
+  // updating the compare to use the first extract.
+  SmallVector<VPValue *> LoopExtracts;
+  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
+  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
+  extractFromALM(LoopALM, Not, LoopExtracts);
+  Not->setOperand(0, LoopExtracts[0]);
+
+  // Update the incoming values of active lane mask phis.
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    Phis[Part]->setStartValue(EntryExtracts[Part]);
+    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
+  }
+
+  return;
+}
+
 /// Try to simplify the branch condition of \p Plan. This may restrict the
 /// resulting plan to \p BestVF and \p BestUF.
-static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
-                                              unsigned BestUF,
-                                              PredicatedScalarEvolution &PSE) {
+static bool
+simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
+                                  unsigned BestUF,
+                                  PredicatedScalarEvolution &PSE,
+                                  bool DataAndControlFlowWithoutRuntimeCheck) {
   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
   auto *Term = &ExitingVPBB->back();
   VPValue *Cond;
   ScalarEvolution &SE = *PSE.getSE();
   using namespace llvm::VPlanPatternMatch;
-  if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
-      match(Term, m_BranchOnCond(
-                      m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  bool BranchALM = match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+                                   m_VPValue(), m_VPValue(), m_VPValue()))));
+
+  if (BranchALM || match(Term, m_BranchOnCount(m_VPValue(), m_VPValue()))) {
+    if (BranchALM && DataAndControlFlowWithoutRuntimeCheck &&
+        EnableWideActiveLaneMask && BestVF.isVector() && BestUF > 1)
+      extractFromWideActiveLaneMask(Plan, BestVF, BestUF);
+
     // Try to simplify the branch condition if TC <= VF * UF when the latch
     // terminator is BranchOnCount or BranchOnCond where the input is
     // Not(ActiveLaneMask).
@@ -1470,7 +1544,6 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   // The vector loop region only executes once. If possible, completely remove
   // the region, otherwise replace the terminator controlling the latch with
   // (BranchOnCond true).
-  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
   auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
   if (all_of(
           Header->phis(),
@@ -1507,14 +1580,15 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   return true;
 }
 
-void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
-                                         unsigned BestUF,
-                                         PredicatedScalarEvolution &PSE) {
+void VPlanTransforms::optimizeForVFAndUF(
+    VPlan &Plan, ElementCount BestVF, unsigned BestUF,
+    PredicatedScalarEvolution &PSE,
+    bool DataAndControlFlowWithoutRuntimeCheck) {
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
 
-  bool MadeChange =
-      simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
+  bool MadeChange = simplifyBranchConditionForVFAndUF(
+      Plan, BestVF, BestUF, PSE, DataAndControlFlowWithoutRuntimeCheck);
   MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
 
   if (MadeChange) {
@@ -2006,9 +2080,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
                                      "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
-      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
-                           DL, "active.lane.mask.entry");
+  VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+      ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+  auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                        {EntryIncrement, TC, ALMMultiplier}, DL,
+                                        "active.lane.mask.entry");
 
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
@@ -2023,8 +2099,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                   {IncrementValue}, {false, false}, DL);
   auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
-                                   {InLoopIncrement, TripCount}, DL,
-                                   "active.lane.mask.next");
+                                   {InLoopIncrement, TripCount, ALMMultiplier},
+                                   DL, "active.lane.mask.next");
   LaneMaskPhi->addOperand(ALM);
 
   // Replace the original terminator with BranchOnCond. We have to invert the
@@ -2101,9 +2177,12 @@ void VPlanTransforms::addActiveLaneMask(
         Plan, DataAndControlFlowWithoutRuntimeCheck);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
-    LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
-                              {WideCanonicalIV, Plan.getTripCount()}, nullptr,
-                              "active.lane.mask");
+    VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+        ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+    LaneMask =
+        B.createNaryOp(VPInstruction::ActiveLaneMask,
+                       {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
+                       nullptr, "active.lane.mask");
   }
 
   // Walk users of WideCanonicalIV and replace all compares of the form
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 8d2eded45da22..920c7aa32cc97 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -109,7 +109,8 @@ struct VPlanTransforms {
   /// resulting plan to \p BestVF and \p BestUF.
   static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                  unsigned BestUF,
-                                 PredicatedScalarEvolution &PSE);
+                                 PredicatedScalarEvolution &PSE,
+                                 bool DataAndControlFlowWithoutRuntimeCheck);
 
   /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
   /// optimizations, dead recipe removal, replicate region optimizations and
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 2dd43c092ff7a..76a37d5ba839b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -250,6 +250,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
     } else {
       assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
              "unexpected header phi recipe not needing unrolled part");
+      cast<VPActiveLaneMaskPHIRecipe>(Copy)->setUnrollPart(Part);
     }
   }
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 81bd21bb904c0..9fdc199fc1dfa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -61,7 +61,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
   VPValue *A, *B;
   using namespace VPlanPatternMatch;
 
-  if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B))))
+  if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1))))
     return B == Plan.getTripCount() &&
            (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()),
                                      m_SpecificInt(1),
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll
new file mode 100644
index 0000000000000..449fc6f7af7c1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4
+; RUN: opt -S -passes=loop-vectorize -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \
+; RUN:   -force-vector-width=4 -force-vector-interleave=0 < %s | FileCheck %s -check-prefix CHECK-UF0
+; RUN: opt -S --passes=loop-vectorize -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \
+; RUN:   -force-vector-width=4 -force-vector-interleave=2 < %s | FileCheck %s -check-prefix CHECK-UF2
+; RUN: opt -S --passes=loop-vectorize -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \
+; RUN:   -force-vector-width=4 -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4
+
+target triple = "aarch64-unknown-linux"
+
+define void @fixed_wide_active_lane_mask(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 {
+; CHECK-UF0-LABEL: define void @fixed_wide_active_lane_mask(
+; CHECK-UF0-SAME: ptr noalias [[DST:%.*]], ptr noalias readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-UF0-NEXT:  entry:
+; CHECK-UF0-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-UF0:       vector.ph:
+; CHECK-UF0-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], 3
+; CHECK-UF0-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; CHECK-UF0-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-UF0-NEXT:    [[TMP0:%.*]] = sub i64 [[N]], 4
+; CHECK-UF0-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[N]], 4
+; CHECK-UF0-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-UF0-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1>
@llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[N]]) +; CHECK-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UF0: vector.body: +; CHECK-UF0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF0-NEXT: [[TMP3:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-UF0-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0 +; CHECK-UF0-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-UF0-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF0-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; CHECK-UF0-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) +; CHECK-UF0-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; CHECK-UF0-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 +; CHECK-UF0-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF0: middle.block: +; +; CHECK-UF2-LABEL: define void @fixed_wide_active_lane_mask( +; CHECK-UF2-SAME: ptr noalias [[DST:%.*]], ptr noalias readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-UF2-NEXT: entry: +; CHECK-UF2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK-UF2: vector.ph: +; CHECK-UF2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 7 +; CHECK-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8 +; CHECK-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-UF2-NEXT: [[TMP0:%.*]] = sub i64 [[N]], 8 +; CHECK-UF2-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[N]], 8 +; CHECK-UF2-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP0]], i64 0 +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 0, i64 [[N]]) +; CHECK-UF2-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-UF2-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 4) +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 4, i64 [[N]]) +; CHECK-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UF2: vector.body: +; CHECK-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[TMP10:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0 +; CHECK-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-UF2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; 
CHECK-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 4 +; CHECK-UF2-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF2-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]]) +; CHECK-UF2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-UF2-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 4 +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT3:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP9]], i64 [[TMP4]]) +; CHECK-UF2-NEXT: [[TMP5]] = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-UF2-NEXT: [[TMP6]] = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 4) +; CHECK-UF2-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-UF2-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0 +; CHECK-UF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF2: middle.block: +; +; CHECK-UF4-LABEL: define void @fixed_wide_active_lane_mask( +; CHECK-UF4-SAME: ptr noalias [[DST:%.*]], ptr noalias readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-UF4-NEXT: entry: +; CHECK-UF4-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK-UF4: vector.ph: +; CHECK-UF4-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 +; CHECK-UF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; CHECK-UF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-UF4-NEXT: [[TMP0:%.*]] = sub i64 [[N]], 16 +; CHECK-UF4-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], 16 +; CHECK-UF4-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP0]], i64 0 +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 4, i64 [[N]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 8, i64 [[N]]) +; CHECK-UF4-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-UF4-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 4) +; CHECK-UF4-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 8) +; CHECK-UF4-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 12) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 12, i64 [[N]]) +; CHECK-UF4-NEXT: br label [[VECTOR_BODY1:%.*]] +; CHECK-UF4: vector.body: +; CHECK-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[TMP1]], [[ENTRY]] ], [ [[TMP9:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ], [ [[TMP10:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = phi <4 x i1> [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi <4 x i1> [ [[TMP4]], [[ENTRY]] ], [ 
[[TMP12:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[TMP7:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-UF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; CHECK-UF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-UF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; CHECK-UF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 4 +; CHECK-UF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 8 +; CHECK-UF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 12 +; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP16]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK4]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP18]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK5]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP19]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK6]]) +; CHECK-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-UF4-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 4 +; CHECK-UF4-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 8 +; CHECK-UF4-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 12 +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP6]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT7:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP13]], i64 [[TMP6]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT8:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP14]], i64 [[TMP6]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT9:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP15]], i64 [[TMP6]]) +; CHECK-UF4-NEXT: [[TMP9]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-UF4-NEXT: [[TMP10]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 4) +; CHECK-UF4-NEXT: [[TMP11]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 8) +; CHECK-UF4-NEXT: [[TMP12]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 12) +; CHECK-UF4-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP9]], splat (i1 true) +; CHECK-UF4-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0 +; CHECK-UF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF4: middle.block: +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %ld = load i32, ptr %src + %arrayidx = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 %ld, ptr %arrayidx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +attributes #0 = { nounwind "target-features"="+neon,+sve" } + +;. +; CHECK-UF0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. 
+; CHECK-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. +; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll new file mode 100644 index 0000000000000..4ded9ab1bad89 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll @@ -0,0 +1,465 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4 +; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=0 < %s | FileCheck %s -check-prefix CHECK-SVE-UF0 +; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=2 < %s | FileCheck %s -check-prefix CHECK-SVE-UF2 +; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-SVE-UF4 + +target triple = "aarch64-unknown-linux" + +define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src, i64 %n) #0 { +; CHECK-SVE-UF0-LABEL: define void @scalable_wide_active_lane_mask( +; CHECK-SVE-UF0-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE-UF0-NEXT: entry: +; CHECK-SVE-UF0-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-SVE-UF0-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF0: for.body.preheader: +; CHECK-SVE-UF0-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-SVE-UF0: vector.ph: +; CHECK-SVE-UF0-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP3]], 16 +; CHECK-SVE-UF0-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-SVE-UF0-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]] +; CHECK-SVE-UF0-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-SVE-UF0-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-SVE-UF0-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP5]], 16 +; CHECK-SVE-UF0-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 16 +; CHECK-SVE-UF0-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP18]] +; CHECK-SVE-UF0-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP18]] +; CHECK-SVE-UF0-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-SVE-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF0: vector.body: +; CHECK-SVE-UF0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH1]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; 
CHECK-SVE-UF0-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-SVE-UF0-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP11]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF0-NEXT: [[TMP6:%.*]] = mul [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-SVE-UF0-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; CHECK-SVE-UF0-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP6]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP12]] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-SVE-UF0-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; CHECK-SVE-UF0-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 +; CHECK-SVE-UF0-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-UF0: middle.block: +; +; CHECK-SVE-UF2-LABEL: define void @scalable_wide_active_lane_mask( +; CHECK-SVE-UF2-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE-UF2-NEXT: entry: +; CHECK-SVE-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-SVE-UF2-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF2: for.body.preheader: +; CHECK-SVE-UF2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-SVE-UF2: vector.ph: +; CHECK-SVE-UF2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP3]], 32 +; CHECK-SVE-UF2-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-SVE-UF2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]] +; CHECK-SVE-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-SVE-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-SVE-UF2-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 32 +; CHECK-SVE-UF2-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP34:%.*]] = mul nuw i64 [[TMP33]], 32 +; CHECK-SVE-UF2-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP34]] +; CHECK-SVE-UF2-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP34]] +; CHECK-SVE-UF2-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-SVE-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 16 +; CHECK-SVE-UF2-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP11]] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]]) +; CHECK-SVE-UF2-NEXT: [[TMP12:%.*]] = call @llvm.vector.extract.nxv16i1.nxv32i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE-UF2-NEXT: [[TMP19:%.*]] = call @llvm.vector.extract.nxv16i1.nxv32i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 16) +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-SVE-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF2: vector.body: +; CHECK-SVE-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[TMP12]], [[VECTOR_PH1]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ 
[[TMP19]], [[VECTOR_PH1]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i32 0 +; CHECK-SVE-UF2-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 16 +; CHECK-SVE-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i64 [[TMP17]] +; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP15]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-SVE-UF2-NEXT: [[TMP13:%.*]] = mul [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-SVE-UF2-NEXT: [[TMP14:%.*]] = mul [[WIDE_MASKED_LOAD3]], splat (i8 3) +; CHECK-SVE-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i32 0 +; CHECK-SVE-UF2-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 16 +; CHECK-SVE-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i64 [[TMP24]] +; CHECK-SVE-UF2-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP13]], ptr [[TMP22]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF2-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP14]], ptr [[TMP25]], i32 1, [[ACTIVE_LANE_MASK2]]) +; CHECK-SVE-UF2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-SVE-UF2-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 16 +; CHECK-SVE-UF2-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], [[TMP27]] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT4:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP28]], i64 [[TMP9]]) +; CHECK-SVE-UF2-NEXT: [[TMP29]] = call @llvm.vector.extract.nxv16i1.nxv32i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE-UF2-NEXT: [[TMP30]] = call @llvm.vector.extract.nxv16i1.nxv32i1( [[ACTIVE_LANE_MASK_NEXT]], i64 16) +; CHECK-SVE-UF2-NEXT: [[TMP31:%.*]] = xor [[TMP29]], splat (i1 true) +; CHECK-SVE-UF2-NEXT: [[TMP32:%.*]] = extractelement [[TMP31]], i32 0 +; CHECK-SVE-UF2-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-UF2: middle.block: +; +; CHECK-SVE-UF4-LABEL: define void @scalable_wide_active_lane_mask( +; CHECK-SVE-UF4-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE-UF4-NEXT: entry: +; CHECK-SVE-UF4-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-SVE-UF4-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF4: for.body.preheader: +; CHECK-SVE-UF4-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-SVE-UF4: vector.ph: +; CHECK-SVE-UF4-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP3]], 64 +; CHECK-SVE-UF4-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-SVE-UF4-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]] +; CHECK-SVE-UF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-SVE-UF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-SVE-UF4-NEXT: [[TMP5:%.*]] = call 
i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 64 +; CHECK-SVE-UF4-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP62:%.*]] = mul nuw i64 [[TMP61]], 64 +; CHECK-SVE-UF4-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP62]] +; CHECK-SVE-UF4-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP62]] +; CHECK-SVE-UF4-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-SVE-UF4-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 16 +; CHECK-SVE-UF4-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP11]] +; CHECK-SVE-UF4-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 32 +; CHECK-SVE-UF4-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP13]] +; CHECK-SVE-UF4-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 48 +; CHECK-SVE-UF4-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP15]] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 0, i64 [[N]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) +; CHECK-SVE-UF4-NEXT: [[TMP16:%.*]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE-UF4-NEXT: [[TMP17:%.*]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 16) +; CHECK-SVE-UF4-NEXT: [[TMP18:%.*]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 32) +; CHECK-SVE-UF4-NEXT: [[TMP19:%.*]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 48) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) +; CHECK-SVE-UF4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF4: vector.body: +; CHECK-SVE-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[TMP16]], [[VECTOR_PH1]] ], [ [[TMP55:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[TMP17]], [[VECTOR_PH1]] ], [ [[TMP56:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[TMP18]], [[VECTOR_PH1]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[TMP19]], [[VECTOR_PH1]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i32 0 +; CHECK-SVE-UF4-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 16 +; CHECK-SVE-UF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i64 [[TMP23]] +; CHECK-SVE-UF4-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP32:%.*]] = mul nuw i64 [[TMP31]], 32 +; CHECK-SVE-UF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i64 [[TMP32]] +; CHECK-SVE-UF4-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP34]], 48 +; CHECK-SVE-UF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr 
[[TMP20]], i64 [[TMP29]] +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP21]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP33]], i32 1, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP30]], i32 1, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-SVE-UF4-NEXT: [[TMP25:%.*]] = mul [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-SVE-UF4-NEXT: [[TMP26:%.*]] = mul [[WIDE_MASKED_LOAD9]], splat (i8 3) +; CHECK-SVE-UF4-NEXT: [[TMP27:%.*]] = mul [[WIDE_MASKED_LOAD10]], splat (i8 3) +; CHECK-SVE-UF4-NEXT: [[TMP28:%.*]] = mul [[WIDE_MASKED_LOAD11]], splat (i8 3) +; CHECK-SVE-UF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i32 0 +; CHECK-SVE-UF4-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], 16 +; CHECK-SVE-UF4-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[TMP38]] +; CHECK-SVE-UF4-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP41:%.*]] = mul nuw i64 [[TMP40]], 32 +; CHECK-SVE-UF4-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[TMP41]] +; CHECK-SVE-UF4-NEXT: [[TMP43:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP44:%.*]] = mul nuw i64 [[TMP43]], 48 +; CHECK-SVE-UF4-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[TMP44]] +; CHECK-SVE-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP25]], ptr [[TMP36]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP26]], ptr [[TMP39]], i32 1, [[ACTIVE_LANE_MASK6]]) +; CHECK-SVE-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP27]], ptr [[TMP42]], i32 1, [[ACTIVE_LANE_MASK7]]) +; CHECK-SVE-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP28]], ptr [[TMP45]], i32 1, [[ACTIVE_LANE_MASK8]]) +; CHECK-SVE-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-SVE-UF4-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP47:%.*]] = mul nuw i64 [[TMP46]], 16 +; CHECK-SVE-UF4-NEXT: [[TMP48:%.*]] = add i64 [[INDEX]], [[TMP47]] +; CHECK-SVE-UF4-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP50:%.*]] = mul nuw i64 [[TMP49]], 32 +; CHECK-SVE-UF4-NEXT: [[TMP51:%.*]] = add i64 [[INDEX]], [[TMP50]] +; CHECK-SVE-UF4-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP53:%.*]] = mul nuw i64 [[TMP52]], 48 +; CHECK-SVE-UF4-NEXT: [[TMP54:%.*]] = add i64 [[INDEX]], [[TMP53]] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT12:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP48]], i64 [[TMP9]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT13:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP51]], i64 [[TMP9]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT14:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP54]], i64 [[TMP9]]) +; CHECK-SVE-UF4-NEXT: [[TMP55]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE-UF4-NEXT: 
[[TMP56]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 16) +; CHECK-SVE-UF4-NEXT: [[TMP57]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 32) +; CHECK-SVE-UF4-NEXT: [[TMP58]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 48) +; CHECK-SVE-UF4-NEXT: [[TMP59:%.*]] = xor [[TMP55]], splat (i1 true) +; CHECK-SVE-UF4-NEXT: [[TMP60:%.*]] = extractelement [[TMP59]], i32 0 +; CHECK-SVE-UF4-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-UF4: middle.block: +; +entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx1 = getelementptr inbounds i8, ptr %src, i64 %iv + %ld = load i8, ptr %arrayidx1 + %mul = mul i8 %ld, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 %mul, ptr %arrayidx2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +define void @scalable_wide_active_lane_mask_float(ptr noalias %dst, ptr readonly %src, i32 %n) #0 { +; CHECK-SVE-UF0-LABEL: define void @scalable_wide_active_lane_mask_float( +; CHECK-SVE-UF0-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-UF0-NEXT: entry: +; CHECK-SVE-UF0-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF0-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF0: for.body.preheader: +; CHECK-SVE-UF0-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF0-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-UF0: vector.ph: +; CHECK-SVE-UF0-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP7]], 2 +; CHECK-SVE-UF0-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-SVE-UF0-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]] +; CHECK-SVE-UF0-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-SVE-UF0-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-SVE-UF0-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP12]], 2 +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF0: vector.body: +; CHECK-SVE-UF0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 0 +; CHECK-SVE-UF0-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF0-NEXT: [[TMP3:%.*]] = fmul [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00) +; CHECK-SVE-UF0-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i32 0 +; CHECK-SVE-UF0-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP3]], ptr [[TMP9]], i32 8, 
[[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF0-NEXT: [[TMP10:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; CHECK-SVE-UF0-NEXT: [[TMP11:%.*]] = extractelement [[TMP10]], i32 0 +; CHECK-SVE-UF0-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-SVE-UF0: middle.block: +; +; CHECK-SVE-UF2-LABEL: define void @scalable_wide_active_lane_mask_float( +; CHECK-SVE-UF2-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-UF2-NEXT: entry: +; CHECK-SVE-UF2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF2-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF2: for.body.preheader: +; CHECK-SVE-UF2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-UF2: vector.ph: +; CHECK-SVE-UF2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP3]], 4 +; CHECK-SVE-UF2-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-SVE-UF2-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]] +; CHECK-SVE-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-SVE-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-SVE-UF2-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP28]], 4 +; CHECK-SVE-UF2-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2 +; CHECK-SVE-UF2-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF2-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE-UF2-NEXT: [[TMP14:%.*]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF2: vector.body: +; CHECK-SVE-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[TMP14]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i32 0 +; CHECK-SVE-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2 +; CHECK-SVE-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP12]] +; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-SVE-UF2-NEXT: [[TMP8:%.*]] = 
fmul [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00) +; CHECK-SVE-UF2-NEXT: [[TMP9:%.*]] = fmul [[WIDE_MASKED_LOAD3]], splat (double 3.000000e+00) +; CHECK-SVE-UF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i32 0 +; CHECK-SVE-UF2-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 2 +; CHECK-SVE-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i64 [[TMP19]] +; CHECK-SVE-UF2-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP8]], ptr [[TMP17]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF2-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP9]], ptr [[TMP20]], i32 8, [[ACTIVE_LANE_MASK2]]) +; CHECK-SVE-UF2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; CHECK-SVE-UF2-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 2 +; CHECK-SVE-UF2-NEXT: [[TMP23:%.*]] = add i64 [[INDEX_NEXT]], [[TMP22]] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT4:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP23]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF2-NEXT: [[TMP24]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE-UF2-NEXT: [[TMP25]] = call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; CHECK-SVE-UF2-NEXT: [[TMP26:%.*]] = xor [[TMP24]], splat (i1 true) +; CHECK-SVE-UF2-NEXT: [[TMP27:%.*]] = extractelement [[TMP26]], i32 0 +; CHECK-SVE-UF2-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-SVE-UF2: middle.block: +; +; CHECK-SVE-UF4-LABEL: define void @scalable_wide_active_lane_mask_float( +; CHECK-SVE-UF4-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-UF4-NEXT: entry: +; CHECK-SVE-UF4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF4-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF4: for.body.preheader: +; CHECK-SVE-UF4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF4-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-UF4: vector.ph: +; CHECK-SVE-UF4-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP3]], 8 +; CHECK-SVE-UF4-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-SVE-UF4-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]] +; CHECK-SVE-UF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-SVE-UF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-SVE-UF4-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP26]], 8 +; CHECK-SVE-UF4-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2 +; CHECK-SVE-UF4-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] +; CHECK-SVE-UF4-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 +; CHECK-SVE-UF4-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] +; CHECK-SVE-UF4-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 6 +; 
CHECK-SVE-UF4-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[TMP11:%.*]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE-UF4-NEXT: [[TMP12:%.*]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; CHECK-SVE-UF4-NEXT: [[TMP13:%.*]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 4) +; CHECK-SVE-UF4-NEXT: [[TMP14:%.*]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 6) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF4: vector.body: +; CHECK-SVE-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[TMP11]], [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[TMP12]], [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[TMP13]], [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[TMP14]], [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i32 0 +; CHECK-SVE-UF4-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP28:%.*]] = mul nuw i64 [[TMP27]], 2 +; CHECK-SVE-UF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP28]] +; CHECK-SVE-UF4-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4 +; CHECK-SVE-UF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP21]] +; CHECK-SVE-UF4-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 6 +; CHECK-SVE-UF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP24]] +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP56]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP29]], i32 8, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP22]], i32 8, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP25]], i32 8, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-SVE-UF4-NEXT: [[TMP16:%.*]] = fmul [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00) +; CHECK-SVE-UF4-NEXT: [[TMP17:%.*]] = fmul [[WIDE_MASKED_LOAD9]], splat (double 3.000000e+00) +; CHECK-SVE-UF4-NEXT: [[TMP18:%.*]] = fmul [[WIDE_MASKED_LOAD10]], splat (double 3.000000e+00) +; CHECK-SVE-UF4-NEXT: [[TMP19:%.*]] = fmul 
[[WIDE_MASKED_LOAD11]], splat (double 3.000000e+00) +; CHECK-SVE-UF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, ptr [[TMP30]], i32 0 +; CHECK-SVE-UF4-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP33:%.*]] = mul nuw i64 [[TMP32]], 2 +; CHECK-SVE-UF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, ptr [[TMP30]], i64 [[TMP33]] +; CHECK-SVE-UF4-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP36:%.*]] = mul nuw i64 [[TMP35]], 4 +; CHECK-SVE-UF4-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, ptr [[TMP30]], i64 [[TMP36]] +; CHECK-SVE-UF4-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP39:%.*]] = mul nuw i64 [[TMP38]], 6 +; CHECK-SVE-UF4-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, ptr [[TMP30]], i64 [[TMP39]] +; CHECK-SVE-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP16]], ptr [[TMP31]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP17]], ptr [[TMP34]], i32 8, [[ACTIVE_LANE_MASK6]]) +; CHECK-SVE-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP18]], ptr [[TMP37]], i32 8, [[ACTIVE_LANE_MASK7]]) +; CHECK-SVE-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP19]], ptr [[TMP40]], i32 8, [[ACTIVE_LANE_MASK8]]) +; CHECK-SVE-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; CHECK-SVE-UF4-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP42:%.*]] = mul nuw i64 [[TMP41]], 2 +; CHECK-SVE-UF4-NEXT: [[TMP43:%.*]] = add i64 [[INDEX_NEXT]], [[TMP42]] +; CHECK-SVE-UF4-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP45:%.*]] = mul nuw i64 [[TMP44]], 4 +; CHECK-SVE-UF4-NEXT: [[TMP46:%.*]] = add i64 [[INDEX_NEXT]], [[TMP45]] +; CHECK-SVE-UF4-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP48:%.*]] = mul nuw i64 [[TMP47]], 6 +; CHECK-SVE-UF4-NEXT: [[TMP49:%.*]] = add i64 [[INDEX_NEXT]], [[TMP48]] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT12:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP43]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT13:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP46]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT14:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP49]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[TMP50]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE-UF4-NEXT: [[TMP51]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; CHECK-SVE-UF4-NEXT: [[TMP52]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 4) +; CHECK-SVE-UF4-NEXT: [[TMP53]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 6) +; CHECK-SVE-UF4-NEXT: [[TMP54:%.*]] = xor [[TMP50]], splat (i1 true) +; CHECK-SVE-UF4-NEXT: [[TMP55:%.*]] = extractelement [[TMP54]], i32 0 +; CHECK-SVE-UF4-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-SVE-UF4: middle.block: +; +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + br 
label %for.body + +for.body: + %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ] + %arrayidx1 = getelementptr inbounds double, ptr %src, i64 %iv + %ld = load double, ptr %arrayidx1 + %mul = fmul double %ld, 3.000000e+00 + %arrayidx2 = getelementptr inbounds double, ptr %dst, i64 %iv + store double %mul, ptr %arrayidx2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" } + +;. +; CHECK-SVE-UF0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE-UF0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE-UF0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE-UF0: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +;. +; CHECK-SVE-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE-UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +;. +; CHECK-SVE-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE-UF4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll new file mode 100644 index 0000000000000..d20216654c3b4 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -S | FileCheck %s + +target triple = "thumbv8.1m.main-arm-unknown-eabihf" + +define void @f0(ptr noalias %dst, ptr readonly %src, i64 %n) #0 { +; CHECK-LABEL: define void @f0( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[VAL]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 31 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 32 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[TMP0]], i64 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> 
@llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK1]], <16 x i8> poison) +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP4]], ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP5]], ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP10]], 3 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 [[MUL]], ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void +; +entry: + %val = icmp sgt i64 %n, 0 + br i1 %val, label %for.body, label %for.end + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv + %0 = load i8, ptr %arrayidx, align 1 + %mul = mul i8 %0, 3 + %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv + store i8 %mul, ptr %arrayidx3, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret void +} + +attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" } + +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 16} +!3 = !{!"llvm.loop.interleave.count", i32 2} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;.
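+; Note on the output checked above: even with -enable-wide-lane-mask, the interleaved
+; fixed-width MVE loop keeps two separate <16 x i1> @llvm.get.active.lane.mask.v16i1.i64
+; calls (at [[INDEX]] and [[INDEX]] + 16) rather than forming a single wide <32 x i1>
+; mask split with @llvm.vector.extract, in contrast to the scalable SVE UF2/UF4 cases
+; earlier in this patch, where one nxv4i1/nxv8i1 mask is extracted into nxv2i1 parts.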