-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[LV] Don't require scalar epilogue for unsupported IAG with tail #96544
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
nikolaypanchenko
wants to merge
2
commits into
llvm:main
Choose a base branch
from
nikolaypanchenko:npanchen/pr/scalar_epilogue_iai_tail
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1446,24 +1446,42 @@ class LoopVectorizationCostModel { | |
|
||
/// Returns true if we're required to use a scalar epilogue for at least | ||
/// the final iteration of the original loop. | ||
bool requiresScalarEpilogue(bool IsVectorizing) const { | ||
bool requiresScalarEpilogue(ElementCount VF) const { | ||
if (!isScalarEpilogueAllowed()) { | ||
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); | ||
LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF | ||
<< " does not require scalar epilogue\n"); | ||
return false; | ||
} | ||
// If we might exit from anywhere but the latch, must run the exiting | ||
// iteration in scalar form. | ||
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { | ||
LLVM_DEBUG( | ||
dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n"); | ||
LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF | ||
<< " requires scalar epilogue: multiple exits\n"); | ||
return true; | ||
} | ||
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { | ||
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " | ||
"interleaved group requires scalar epilogue\n"); | ||
return true; | ||
if (VF.isVector()) { | ||
if (InterleaveInfo.requiresScalarEpilogue()) { | ||
Mel-Chen marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Make sure interleaved groups that require scalar epilogue will be | ||
// widened. | ||
if (any_of(InterleaveInfo.getInterleaveGroups(), [&](auto *Group) { | ||
if (!Group->requiresScalarEpilogue()) | ||
return false; | ||
|
||
Instruction *I = Group->getMember(0); | ||
InstWidening Decision = getWideningDecision(I, VF); | ||
return Decision == CM_Interleave || | ||
(Decision == CM_Unknown && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it possible that the decision later changes and the result of |
||
interleavedAccessCanBeWidened(I, VF)); | ||
})) { | ||
LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF | ||
<< " requires scalar epilogue: interleaved group " | ||
"requires scalar epilogue\n"); | ||
return true; | ||
} | ||
} | ||
} | ||
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); | ||
LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF | ||
<< " does not require scalar epilogue\n"); | ||
return false; | ||
} | ||
|
||
|
@@ -1473,7 +1491,7 @@ class LoopVectorizationCostModel { | |
/// none. | ||
bool requiresScalarEpilogue(VFRange Range) const { | ||
auto RequiresScalarEpilogue = [this](ElementCount VF) { | ||
return requiresScalarEpilogue(VF.isVector()); | ||
return requiresScalarEpilogue(VF); | ||
}; | ||
bool IsRequired = all_of(Range, RequiresScalarEpilogue); | ||
assert( | ||
|
@@ -2770,7 +2788,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { | |
// the step does not evenly divide the trip count, no adjustment is necessary | ||
// since there will already be scalar iterations. Note that the minimum | ||
// iterations check ensures that N >= Step. | ||
if (Cost->requiresScalarEpilogue(VF.isVector())) { | ||
if (Cost->requiresScalarEpilogue(VF)) { | ||
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); | ||
R = Builder.CreateSelect(IsZero, Step, R); | ||
} | ||
|
@@ -2823,8 +2841,8 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { | |
// vector trip count is zero. This check also covers the case where adding one | ||
// to the backedge-taken count overflowed leading to an incorrect trip count | ||
// of zero. In this case we will also jump to the scalar loop. | ||
auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE | ||
: ICmpInst::ICMP_ULT; | ||
auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE | ||
: ICmpInst::ICMP_ULT; | ||
|
||
// If tail is to be folded, vector loop takes care of all iterations. | ||
Type *CountTy = Count->getType(); | ||
|
@@ -2873,7 +2891,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { | |
|
||
// Update dominator for Bypass & LoopExit (if needed). | ||
DT->changeImmediateDominator(Bypass, TCCheckBlock); | ||
if (!Cost->requiresScalarEpilogue(VF.isVector())) | ||
if (!Cost->requiresScalarEpilogue(VF)) | ||
// If there is an epilogue which must run, there's no edge from the | ||
// middle block to exit blocks and thus no need to update the immediate | ||
// dominator of the exit blocks. | ||
|
@@ -2902,7 +2920,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { | |
// Update dominator only if this is first RT check. | ||
if (LoopBypassBlocks.empty()) { | ||
DT->changeImmediateDominator(Bypass, SCEVCheckBlock); | ||
if (!Cost->requiresScalarEpilogue(VF.isVector())) | ||
if (!Cost->requiresScalarEpilogue(VF)) | ||
// If there is an epilogue which must run, there's no edge from the | ||
// middle block to exit blocks and thus no need to update the immediate | ||
// dominator of the exit blocks. | ||
|
@@ -2955,7 +2973,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { | |
LoopVectorPreHeader = OrigLoop->getLoopPreheader(); | ||
assert(LoopVectorPreHeader && "Invalid loop structure"); | ||
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr | ||
assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && | ||
assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && | ||
"multiple exit loop without required epilogue?"); | ||
|
||
LoopMiddleBlock = | ||
|
@@ -2970,7 +2988,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { | |
// unconditional branch from the middle block to the scalar preheader. In that | ||
// case, there's no edge from the middle block to exit blocks and thus no | ||
// need to update the immediate dominator of the exit blocks. | ||
if (Cost->requiresScalarEpilogue(VF.isVector())) { | ||
if (Cost->requiresScalarEpilogue(VF)) { | ||
assert( | ||
LoopMiddleBlock->getSingleSuccessor() == LoopScalarPreHeader && | ||
" middle block should have the scalar preheader as single successor"); | ||
|
@@ -3103,7 +3121,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { | |
// Thus if tail is to be folded, we know we don't need to run the | ||
// remainder and we can use the previous value for the condition (true). | ||
// 3) Otherwise, construct a runtime check. | ||
if (!Cost->requiresScalarEpilogue(VF.isVector()) && | ||
if (!Cost->requiresScalarEpilogue(VF) && | ||
!Cost->foldTailByMasking()) { | ||
// Here we use the same DebugLoc as the scalar loop latch terminator instead | ||
// of the corresponding compare because they may have ended up with | ||
|
@@ -3413,7 +3431,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, | |
VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); | ||
VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock(); | ||
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); | ||
if (Cost->requiresScalarEpilogue(VF.isVector())) { | ||
if (Cost->requiresScalarEpilogue(VF)) { | ||
// No edge from the middle block to the unique exit block has been inserted | ||
// and there is nothing to fix from vector loop; phis should have incoming | ||
// from scalar loop only. | ||
|
@@ -4664,7 +4682,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( | |
// When a scalar epilogue is required, at least one iteration of the scalar | ||
// loop has to execute. Adjust MaxTripCount accordingly to avoid picking a | ||
// max VF that results in a dead vector loop. | ||
if (MaxTripCount > 0 && requiresScalarEpilogue(true)) | ||
if (MaxTripCount > 0 && requiresScalarEpilogue(MaxVectorElementCount)) | ||
MaxTripCount -= 1; | ||
|
||
if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && | ||
|
@@ -5304,7 +5322,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, | |
// At least one iteration must be scalar when this constraint holds. So the | ||
// maximum available iterations for interleaving is one less. | ||
unsigned AvailableTC = | ||
requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC; | ||
requiresScalarEpilogue(VF) ? KnownTC - 1 : KnownTC; | ||
|
||
// If trip count is known we select between two prospective ICs, where | ||
// 1) the aggressive IC is capped by the trip count divided by VF | ||
|
@@ -5333,7 +5351,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, | |
} else if (BestKnownTC && *BestKnownTC > 0) { | ||
// At least one iteration must be scalar when this constraint holds. So the | ||
// maximum available iterations for interleaving is one less. | ||
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) | ||
unsigned AvailableTC = requiresScalarEpilogue(VF) | ||
? (*BestKnownTC) - 1 | ||
: *BestKnownTC; | ||
|
||
|
@@ -7640,8 +7658,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, | |
|
||
// Generate code to check if the loop's trip count is less than VF * UF of the | ||
// main vector loop. | ||
auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() | ||
: VF.isVector()) | ||
auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) | ||
? ICmpInst::ICMP_ULE | ||
: ICmpInst::ICMP_ULT; | ||
|
||
|
@@ -7663,7 +7680,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, | |
|
||
// Update dominator for Bypass & LoopExit. | ||
DT->changeImmediateDominator(Bypass, TCCheckBlock); | ||
if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) | ||
if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) | ||
// For loops with multiple exits, there's no edge from the middle block | ||
// to exit blocks (as the epilogue must run) and thus no need to update | ||
// the immediate dominator of the exit blocks. | ||
|
@@ -7732,7 +7749,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( | |
|
||
DT->changeImmediateDominator(LoopScalarPreHeader, | ||
EPI.EpilogueIterationCountCheck); | ||
if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) | ||
if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) | ||
// If there is an epilogue which must run, there's no edge from the | ||
// middle block to exit blocks and thus no need to update the immediate | ||
// dominator of the exit blocks. | ||
|
@@ -7814,9 +7831,8 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( | |
|
||
// Generate code to check if the loop's trip count is less than VF * UF of the | ||
// vector epilogue loop. | ||
auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) | ||
? ICmpInst::ICMP_ULE | ||
: ICmpInst::ICMP_ULT; | ||
auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? ICmpInst::ICMP_ULE | ||
: ICmpInst::ICMP_ULT; | ||
|
||
Value *CheckMinIters = | ||
Builder.CreateICmp(P, Count, | ||
|
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should not be needed to check for #groups.
requiresScalarEpilogue
is false when no groups were constructed.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, so the below code should be good enough: