diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 7ac4010b7b320..3bc2dfd623777 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -58,13 +58,21 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
 
   VPValue *A, *B;
 
+  auto m_CanonicalScalarIVSteps =
+      m_ScalarIVSteps(m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()),
+                      m_One(), m_Specific(&Plan.getVF()));
+
   if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One())))
     return B == Plan.getTripCount() &&
-           (match(A,
-                  m_ScalarIVSteps(
-                      m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()),
-                      m_One(), m_Specific(&Plan.getVF()))) ||
-            IsWideCanonicalIV(A));
+           (match(A, m_CanonicalScalarIVSteps) || IsWideCanonicalIV(A));
+
+  // For scalar plans, the header mask uses the scalar steps.
+  if (match(V, m_ICmp(m_CanonicalScalarIVSteps,
+                      m_Specific(Plan.getBackedgeTakenCount())))) {
+    assert(Plan.hasScalarVFOnly() &&
+           "Non-scalar VF using scalar IV steps for header mask?");
+    return true;
+  }
 
   return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) &&
          B == Plan.getBackedgeTakenCount();
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
index 3bc5da155b351..277be4666243b 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -pass-remarks='loop-vectorize' -disable-output -S 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS
-; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -pass-remarks='loop-vectorize' -disable-output -S 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s
 
 ; These tests are to check that fold-tail procedure produces correct scalar code when
 ; loop-vectorization is only unrolling but not vectorizing.
@@ -141,5 +141,61 @@ for.body:
   %cond = icmp eq ptr %ptr, %ptr2
   br i1 %cond, label %for.cond.cleanup, label %for.body
 }
+
+define i64 @live_out_scalar_vf(i64 %n) {
+; CHECK-LABEL: @live_out_scalar_vf(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT:    [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-NEXT:    [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[STEP_ADD_3]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[STEP_ADD_3]], i32 2
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[EXITVAL:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[TMP19:%.*]] = phi i64 [ [[EXITVAL]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[TMP19]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  ; Need to use a phi, otherwise the header mask will use a
+  ; VPWidenCanonicalIVRecipe instead of a VPScalarIVStepsRecipe.
+  %exitval = phi i64 [ 0, %entry ], [ %iv, %loop ]
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i64 %exitval
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK-REMARKS: {{.*}}
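
Note for reviewers (not part of the patch): when tail folding a plan that only has a scalar VF, the header mask in the emitted IR is a plain scalar compare of the IV steps against the backedge-taken count, roughly of the following shape. The value names here are illustrative, not verbatim output of the test above:

  ; One compare per unrolled part; %index is the canonical IV and
  ; %trip.count.minus.1 the backedge-taken count.
  %induction = add i64 %index, 0
  %active = icmp ule i64 %induction, %trip.count.minus.1

Before this change, vputils::isHeaderMask only matched the active-lane-mask form and compares of a widened canonical IV, so masks of this scalar shape were not recognized as header masks. The new m_ICmp(m_CanonicalScalarIVSteps, m_Specific(Plan.getBackedgeTakenCount())) case covers them, and the assert guards that only plans with hasScalarVFOnly() produce this shape.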