diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index febdc54e666a9..1ea8a200cd158 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -249,6 +249,11 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
         "Use predicated EVL instructions for tail folding. If EVL "
         "is unsupported, fallback to data-without-lane-mask.")));
 
+cl::opt<bool> llvm::EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide lane masks when used for control flow in "
+             "tail-folded loops"));
+
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1286,6 +1291,12 @@ class LoopVectorizationCostModel {
     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
   }
 
+  /// Returns true if tail-folding is preferred over a scalar epilogue.
+  bool preferPredicatedLoop() const {
+    return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
+           ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
+  }
+
   /// Returns the TailFoldingStyle that is best for the current loop.
   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
     if (!ChosenTailFoldingStyle)
@@ -1346,6 +1357,17 @@ class LoopVectorizationCostModel {
     return getTailFoldingStyle() != TailFoldingStyle::None;
   }
 
+  /// Returns true if the use of wide lane masks is requested and the loop is
+  /// using tail-folding with a lane mask for control flow.
+  bool useWideActiveLaneMask() const {
+    if (!EnableWideActiveLaneMask)
+      return false;
+
+    TailFoldingStyle TF = getTailFoldingStyle();
+    return TF == TailFoldingStyle::DataAndControlFlow ||
+           TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+  }
+
   /// Return maximum safe number of elements to be processed per vector
   /// iteration, which do not prevent store-load forwarding and are safe with
   /// regard to the memory dependencies. Required for EVL-based VPlans to
@@ -4518,7 +4540,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // 3. We don't interleave if we think that we will spill registers to memory
   // due to the increased register pressure.
 
-  if (!CM.isScalarEpilogueAllowed())
+  // Only interleave tail-folded loops if wide lane masks are requested, as the
+  // overhead of multiple instructions to calculate the predicate is likely
+  // not beneficial. If a scalar epilogue is not allowed for any other reason,
+  // do not interleave.
+  if (!CM.isScalarEpilogueAllowed() &&
+      !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask()))
     return 1;
 
   if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 64bbe630e3172..300fe7fe06f9f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,10 +40,6 @@
 using namespace llvm;
 using namespace VPlanPatternMatch;
 
-static cl::opt<bool> EnableWideActiveLaneMask(
-    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
-    cl::desc("Enable use of wide get active lane mask instructions"));
-
 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
     VPlan &Plan,
     function_ref<const InductionDescriptor *(PHINode *)>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b28559b620e13..34850743e7b62 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -32,6 +32,7 @@ class VPRecipeBuilder;
 struct VFRange;
 
 extern cl::opt<bool> VerifyEachVPlan;
+extern cl::opt<bool> EnableWideActiveLaneMask;
 
 struct VPlanTransforms {
   /// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
index f2e3b708d7820..61da142ad376c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
@@ -1,6 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^middle.block:" --version 4
 ; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=1 < %s | FileCheck %s -check-prefix CHECK-UF1
 ; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4
+; RUN: opt -S --passes=loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize < %s | FileCheck %s -check-prefix CHECK-TF
+; RUN: opt -S --passes=forceattrs,loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-attribute=optsize < %s | FileCheck %s -check-prefix CHECK-UF1
 
 target triple = "aarch64-unknown-linux"
 
@@ -101,6 +103,49 @@ define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src,
 ; CHECK-UF4-NEXT:    br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-UF4:       middle.block:
 ;
+; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask(
+; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-TF-NEXT:  entry:
+; CHECK-TF-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK-TF:       vector.ph:
+; CHECK-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
+; CHECK-TF-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 5
+; CHECK-TF-NEXT:    [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
+; CHECK-TF-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
+; CHECK-TF-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16)
+; CHECK-TF-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-TF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-TF:       vector.body:
+; CHECK-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 4
+; CHECK-TF-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]]
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP9]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT:    [[TMP13:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3)
+; CHECK-TF-NEXT:    [[TMP14:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3)
+; CHECK-TF-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 4
+; CHECK-TF-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 [[TMP17]]
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP15]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP14]], ptr align 1 [[TMP18]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP6]])
+; CHECK-TF-NEXT:    [[TMP19]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16)
+; CHECK-TF-NEXT:    [[TMP20]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-TF-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[TMP20]], i32 0
+; CHECK-TF-NEXT:    [[TMP22:%.*]] = xor i1 [[TMP21]], true
+; CHECK-TF-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-TF:       middle.block:
+;
 entry:
   br label %for.body
 
@@ -222,6 +267,52 @@ define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonl
 ; CHECK-UF4-NEXT:    br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-UF4:       middle.block:
 ;
+; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask_double(
+; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-TF-NEXT:  entry:
+; CHECK-TF-NEXT:    [[CMP6:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-TF-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK-TF:       for.body.preheader:
+; CHECK-TF-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK-TF:       vector.ph:
+; CHECK-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-TF-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; CHECK-TF-NEXT:    [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
+; CHECK-TF-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
+; CHECK-TF-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
+; CHECK-TF-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-TF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-TF:       vector.body:
+; CHECK-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 2 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
+; CHECK-TF-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP9]], i64 [[TMP11]]
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP12]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 2 x double> poison)
+; CHECK-TF-NEXT:    [[TMP13:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
+; CHECK-TF-NEXT:    [[TMP14:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD2]], splat (double 3.000000e+00)
+; CHECK-TF-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 1
+; CHECK-TF-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP17]]
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP13]], ptr align 8 [[TMP15]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr align 8 [[TMP18]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
+; CHECK-TF-NEXT:    [[TMP19]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-TF-NEXT:    [[TMP20]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-TF-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[TMP20]], i32 0
+; CHECK-TF-NEXT:    [[TMP22:%.*]] = xor i1 [[TMP21]], true
+; CHECK-TF-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-TF:       middle.block:
+;
 entry:
   %cmp6 = icmp sgt i64 %n, 0
   br i1 %cmp6, label %for.body, label %for.end
@@ -243,14 +334,3 @@ for.end:
 
 attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" }
 
-;.
-; CHECK-UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-;.
-; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-;.