From 876266a7bc8294bc727c60928df0c62781788700 Mon Sep 17 00:00:00 2001 From: Rin Dobrescu Date: Thu, 5 Oct 2023 15:29:10 +0000 Subject: [PATCH 1/4] Squashed commit of the following: commit 9c2faf15231ac5ebc168161d1731feed55eb177c Merge: 0a0ac8da5df6 baecc9e997dd Author: Rin Date: Thu Oct 5 11:19:13 2023 +0100 Merge branch 'main' into maxTC_tailBase commit 0a0ac8da5df684b865d0fb16f7a806832f37e05b Author: Rin Dobrescu Date: Thu Sep 28 15:48:49 2023 +0000 [AArch64][LoopVectorize] Use upper bound trip count instead of the constant TC when choosing max VF commit 26e009c770ace2c75d8b8c49bde8cfd7af911f13 Author: Rin Dobrescu Date: Thu Sep 28 10:30:39 2023 +0000 Remove 'assertions automatically generated' line from test commit e05612963bae4a4922a0b90e4b1d51382f202e4e Author: Rin Dobrescu Date: Wed Sep 27 14:47:42 2023 +0000 Address comments and fix tests commit 1bf78c81c678de436795a07c0b62037f782aa53f Author: Rin Dobrescu Date: Mon Sep 25 11:34:15 2023 +0000 [AArch64][LoopVectorize] Use either fixed-width or scalable VF when tail-folding --- .../Transforms/Vectorize/LoopVectorize.cpp | 65 ++++++++++--------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 53ad37bf3599b..26bf92d7d7c02 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1663,17 +1663,17 @@ class LoopVectorizationCostModel { /// disabled or unsupported, then the scalable part will be equal to /// ElementCount::getScalable(0). FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, + unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. /// This is a helper function of computeFeasibleMaxVF. - ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, - unsigned SmallestType, - unsigned WidestType, - ElementCount MaxSafeVF, - bool FoldTailByMasking); + ElementCount + getMaximizedVFForTarget(unsigned ConstTripCount, unsigned MaxTripCount, + unsigned SmallestType, unsigned WidestType, + ElementCount MaxSafeVF, bool FoldTailByMasking); /// \return the maximum legal scalable VF, based on the safe max number /// of elements. @@ -4811,7 +4811,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { } FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( - unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { + unsigned ConstTripCount, unsigned MaxTripCount, ElementCount UserVF, + bool FoldTailByMasking) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -4898,14 +4899,14 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( FixedScalableVFPair Result(ElementCount::getFixed(1), ElementCount::getScalable(0)); - if (auto MaxVF = - getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, - MaxSafeFixedVF, FoldTailByMasking)) + if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, MaxTripCount, + SmallestType, WidestType, + MaxSafeFixedVF, FoldTailByMasking)) Result.FixedVF = MaxVF; - if (auto MaxVF = - getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, - MaxSafeScalableVF, FoldTailByMasking)) + if (auto MaxVF = getMaximizedVFForTarget( + ConstTripCount, MaxTripCount, SmallestType, WidestType, + MaxSafeScalableVF, FoldTailByMasking)) if (MaxVF.isScalable()) { Result.ScalableVF = MaxVF; LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF @@ -4928,6 +4929,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); + unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); if (TC == 1) { reportVectorizationFailure("Single iteration (non) loop", @@ -4938,7 +4940,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return computeFeasibleMaxVF(TC, UserVF, false); + return computeFeasibleMaxVF(TC, MaxTC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: [[fallthrough]]; case CM_ScalarEpilogueNotNeededUsePredicate: @@ -4976,7 +4978,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF, false); + return computeFeasibleMaxVF(TC, MaxTC, UserVF, false); } return FixedScalableVFPair::getNone(); } @@ -4993,7 +4995,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); + FixedScalableVFPair MaxFactors = + computeFeasibleMaxVF(TC, MaxTC, UserVF, true); // Avoid tail folding if the trip count is known to be a multiple of any VF // we choose. @@ -5069,8 +5072,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( - unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - ElementCount MaxSafeVF, bool FoldTailByMasking) { + unsigned ConstTripCount, unsigned MaxTripCount, unsigned SmallestType, + unsigned WidestType, ElementCount MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); const TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector @@ -5108,24 +5111,24 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( } // When a scalar epilogue is required, at least one iteration of the scalar - // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a + // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a // max VF that results in a dead vector loop. - if (ConstTripCount > 0 && requiresScalarEpilogue(true)) - ConstTripCount -= 1; - - if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC && - (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { - // If loop trip count (TC) is known at compile time there is no point in - // choosing VF greater than TC (as done in the loop below). Select maximum - // power of two which doesn't exceed TC. - // If MaxVectorElementCount is scalable, we only fall back on a fixed VF - // when the TC is less than or equal to the known number of lanes. - auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount); + if (MaxTripCount > 0 && requiresScalarEpilogue(true)) + MaxTripCount -= 1; + + if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && + (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { + // If upper bound loop trip count (TC) is known at compile time there is no + // point in choosing VF greater than TC (as done in the loop below). Select + // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is + // scalable, we only fall back on a fixed VF when the TC is less than or + // equal to the known number of lanes. + auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " "exceeding the constant trip count: " - << ClampedConstTripCount << "\n"); + << ClampedUpperTripCount << "\n"); return ElementCount::get( - ClampedConstTripCount, + ClampedUpperTripCount, FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); } From b1708767b3771f6e3922f6fd64ddf8dfa8f1349f Mon Sep 17 00:00:00 2001 From: Rin Dobrescu Date: Fri, 6 Oct 2023 12:36:40 +0000 Subject: [PATCH 2/4] Remove constant TC from getMaximizedVFForTarget and computeFeasibleMaxVF functions and add test --- .../Transforms/Vectorize/LoopVectorize.cpp | 38 +++++++++---------- .../LoopVectorize/AArch64/wide-trip-count.ll | 33 ++++++++++++++++ 2 files changed, 51 insertions(+), 20 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/wide-trip-count.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 26bf92d7d7c02..8375190dddbcc 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1662,18 +1662,18 @@ class LoopVectorizationCostModel { /// elements is a power-of-2 larger than zero. If scalable vectorization is /// disabled or unsupported, then the scalable part will be equal to /// ElementCount::getScalable(0). - FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, - unsigned MaxTripCount, + FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. /// This is a helper function of computeFeasibleMaxVF. - ElementCount - getMaximizedVFForTarget(unsigned ConstTripCount, unsigned MaxTripCount, - unsigned SmallestType, unsigned WidestType, - ElementCount MaxSafeVF, bool FoldTailByMasking); + ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, + unsigned SmallestType, + unsigned WidestType, + ElementCount MaxSafeVF, + bool FoldTailByMasking); /// \return the maximum legal scalable VF, based on the safe max number /// of elements. @@ -4811,8 +4811,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { } FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( - unsigned ConstTripCount, unsigned MaxTripCount, ElementCount UserVF, - bool FoldTailByMasking) { + unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -4899,14 +4898,14 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( FixedScalableVFPair Result(ElementCount::getFixed(1), ElementCount::getScalable(0)); - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, MaxTripCount, - SmallestType, WidestType, - MaxSafeFixedVF, FoldTailByMasking)) + if (auto MaxVF = + getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, + MaxSafeFixedVF, FoldTailByMasking)) Result.FixedVF = MaxVF; - if (auto MaxVF = getMaximizedVFForTarget( - ConstTripCount, MaxTripCount, SmallestType, WidestType, - MaxSafeScalableVF, FoldTailByMasking)) + if (auto MaxVF = + getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, + MaxSafeScalableVF, FoldTailByMasking)) if (MaxVF.isScalable()) { Result.ScalableVF = MaxVF; LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF @@ -4940,7 +4939,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return computeFeasibleMaxVF(TC, MaxTC, UserVF, false); + return computeFeasibleMaxVF(MaxTC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: [[fallthrough]]; case CM_ScalarEpilogueNotNeededUsePredicate: @@ -4978,7 +4977,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, MaxTC, UserVF, false); + return computeFeasibleMaxVF(MaxTC, UserVF, false); } return FixedScalableVFPair::getNone(); } @@ -4995,8 +4994,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - FixedScalableVFPair MaxFactors = - computeFeasibleMaxVF(TC, MaxTC, UserVF, true); + FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true); // Avoid tail folding if the trip count is known to be a multiple of any VF // we choose. @@ -5072,8 +5070,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( - unsigned ConstTripCount, unsigned MaxTripCount, unsigned SmallestType, - unsigned WidestType, ElementCount MaxSafeVF, bool FoldTailByMasking) { + unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, + ElementCount MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); const TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wide-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wide-trip-count.ll new file mode 100644 index 0000000000000..f501705feb88b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/wide-trip-count.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -S < %s -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve 2>&1 | FileCheck %s + +define void @wide_tc_8(ptr nocapture %dst, i32 %n, i64 %val){ +; CHECK-LABEL: define void @wide_tc_8( +; CHECK: call void @llvm.masked.store.nxv8i8.p0( {{.*}}, ptr {{.*}}, i32 1, {{.*}}) + +entry: + %rem = and i32 %n, 63 + %cmp8.not = icmp eq i32 %rem, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %add = add nuw nsw i32 %rem, 7 + %shr = lshr i32 %add, 3 + %wide.trip.count = zext i32 %shr to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %p_out_tail.09 = phi ptr [ %dst, %for.body.preheader ], [ %incdec.ptr, %for.body ] + %0 = shl nuw nsw i64 %indvars.iv, 3 + %shr3 = lshr i64 %val, %0 + %conv4 = trunc i64 %shr3 to i8 + store i8 %conv4, ptr %p_out_tail.09, align 1 + %incdec.ptr = getelementptr inbounds i8, ptr %p_out_tail.09, i64 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void +} From afe149cf57fa67e061bf491af9ede9e7c857724b Mon Sep 17 00:00:00 2001 From: Rin Dobrescu Date: Mon, 9 Oct 2023 10:14:40 +0000 Subject: [PATCH 3/4] Add test in clamped-trip-count.ll --- .../AArch64/clamped-trip-count.ll | 31 +++++++++++++++++ .../LoopVectorize/AArch64/wide-trip-count.ll | 33 ------------------- 2 files changed, 31 insertions(+), 33 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/wide-trip-count.ll diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll index 29a64d70a3635..26dc818d3c1ea 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll @@ -21,3 +21,34 @@ for.body: ; preds = %entry, %for.body for.cond.cleanup: ; preds = %for.body ret void } + +define void @wide_tc_8(ptr nocapture %dst, i32 %n, i64 %val){ +; CHECK-LABEL: define void @wide_tc_8( +; CHECK: call void @llvm.masked.store.nxv8i8.p0( {{.*}}, ptr {{.*}}, i32 1, {{.*}}) + +entry: + %rem = and i32 %n, 63 + %cmp8.not = icmp eq i32 %rem, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %add = add nuw nsw i32 %rem, 7 + %shr = lshr i32 %add, 3 + %wide.trip.count = zext i32 %shr to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %p_out_tail.09 = phi ptr [ %dst, %for.body.preheader ], [ %incdec.ptr, %for.body ] + %0 = shl nuw nsw i64 %indvars.iv, 3 + %shr3 = lshr i64 %val, %0 + %conv4 = trunc i64 %shr3 to i8 + store i8 %conv4, ptr %p_out_tail.09, align 1 + %incdec.ptr = getelementptr inbounds i8, ptr %p_out_tail.09, i64 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wide-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wide-trip-count.ll deleted file mode 100644 index f501705feb88b..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/wide-trip-count.ll +++ /dev/null @@ -1,33 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -S < %s -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve 2>&1 | FileCheck %s - -define void @wide_tc_8(ptr nocapture %dst, i32 %n, i64 %val){ -; CHECK-LABEL: define void @wide_tc_8( -; CHECK: call void @llvm.masked.store.nxv8i8.p0( {{.*}}, ptr {{.*}}, i32 1, {{.*}}) - -entry: - %rem = and i32 %n, 63 - %cmp8.not = icmp eq i32 %rem, 0 - br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader - -for.body.preheader: ; preds = %entry - %add = add nuw nsw i32 %rem, 7 - %shr = lshr i32 %add, 3 - %wide.trip.count = zext i32 %shr to i64 - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] - %p_out_tail.09 = phi ptr [ %dst, %for.body.preheader ], [ %incdec.ptr, %for.body ] - %0 = shl nuw nsw i64 %indvars.iv, 3 - %shr3 = lshr i64 %val, %0 - %conv4 = trunc i64 %shr3 to i8 - store i8 %conv4, ptr %p_out_tail.09, align 1 - %incdec.ptr = getelementptr inbounds i8, ptr %p_out_tail.09, i64 1 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body - -for.cond.cleanup: ; preds = %for.body - ret void -} From 8737b7618d9526f6aa54906d2f4c8d9c14d7686a Mon Sep 17 00:00:00 2001 From: Rin Dobrescu Date: Mon, 9 Oct 2023 13:32:29 +0000 Subject: [PATCH 4/4] Rename test --- .../Transforms/LoopVectorize/AArch64/clamped-trip-count.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll index 26dc818d3c1ea..4b3d2cd90013d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll @@ -22,8 +22,8 @@ for.cond.cleanup: ; preds = %for.body ret void } -define void @wide_tc_8(ptr nocapture %dst, i32 %n, i64 %val){ -; CHECK-LABEL: define void @wide_tc_8( +define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){ +; CHECK-LABEL: define void @clamped_tc_max_8( ; CHECK: call void @llvm.masked.store.nxv8i8.p0( {{.*}}, ptr {{.*}}, i32 1, {{.*}}) entry: