-
Notifications
You must be signed in to change notification settings - Fork 14.6k
[VPlan] Delay adding canonical IV increment. #82270
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 5 commits
2ae1d69
470e0ae
311f105
fbf1e06
53b812d
304f11b
4afb1f0
6af2b50
e1cd5d9
9770a1e
62e78e9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -7639,6 +7639,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( | |||||
|
||||||
// TODO: Move to VPlan transform stage once the transition to the VPlan-based | ||||||
// cost model is complete for better cost estimates. | ||||||
TailFoldingStyle Style = CM.getTailFoldingStyle( | ||||||
!isIndvarOverflowCheckKnownFalse(&CM, BestVF, BestUF)); | ||||||
// When not folding the tail, we know that the induction increment will not | ||||||
// overflow. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Otherwise, when folding tail, the induction increment may always overflow? Perhaps consider above isIndvarOverflowCheckKnownFalse()? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current code just moves the existing logic. Put up #111758 to improve this separately. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PR has now landed, updated the code here. |
||||||
bool HasNUW = Style == TailFoldingStyle::None; | ||||||
bool WithoutRuntimeCheck = | ||||||
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; | ||||||
VPlanTransforms::lowerCanonicalIV(BestVPlan, HasNUW, WithoutRuntimeCheck); | ||||||
VPlanTransforms::unrollByUF(BestVPlan, BestUF, | ||||||
OrigLoop->getHeader()->getContext()); | ||||||
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); | ||||||
|
@@ -8768,29 +8776,25 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, | |||||
} | ||||||
} | ||||||
|
||||||
// Add the necessary canonical IV and branch recipes required to control the | ||||||
// loop. | ||||||
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, | ||||||
DebugLoc DL) { | ||||||
// Add the required canonical IV. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done thanks |
||||||
static void addCanonicalIV(VPlan &Plan, Type *IdxTy, DebugLoc DL) { | ||||||
Value *StartIdx = ConstantInt::get(IdxTy, 0); | ||||||
auto *StartV = Plan.getOrAddLiveIn(StartIdx); | ||||||
|
||||||
// Add a VPCanonicalIVPHIRecipe starting at 0 to the header. | ||||||
// TODO: Introduce a separate scalar phi recipe that can be used for codegen, | ||||||
// turning VPCanonicalIVPHIRecipe into an 'abstract' recipe which cannot be | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. VPCanonicalIVPHIRecipe, as a phi recipe in the header block, and BranchOnCount as a recipe in the latch, seem to retain their individual concrete semantics, even if the increment is introduced later between them. |
||||||
// executed directly. | ||||||
auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); | ||||||
VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); | ||||||
VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); | ||||||
Header->insert(CanonicalIVPHI, Header->begin()); | ||||||
|
||||||
VPBuilder Builder(TopRegion->getExitingBasicBlock()); | ||||||
// Add a VPInstruction to increment the scalar canonical IV by VF * UF. | ||||||
auto *CanonicalIVIncrement = Builder.createOverflowingOp( | ||||||
Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL, | ||||||
"index.next"); | ||||||
CanonicalIVPHI->addOperand(CanonicalIVIncrement); | ||||||
|
||||||
// Add the BranchOnCount VPInstruction to the latch. | ||||||
VPBuilder Builder(TopRegion->getExitingBasicBlock()); | ||||||
// TODO: introduce branch-on-count during VPlan final (pre-codegen) lowering. | ||||||
Builder.createNaryOp(VPInstruction::BranchOnCount, | ||||||
{CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); | ||||||
{CanonicalIVPHI, &Plan.getVectorTripCount()}, DL); | ||||||
Comment on lines
+8933
to
+8935
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about having BranchOnCount also be responsible for bumping the IV, at least initially? (Inspired by PowerPC's bdnz instruction; can call it BranchOnIncrementCount.) It could then feed back the canonical IV Phi across the back-edge, and possibly be split into a separate Add later to simplify code-gen. |
||||||
} | ||||||
|
||||||
/// Create resume phis in the scalar preheader for first-order recurrences and | ||||||
|
@@ -9038,10 +9042,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { | |||||
|
||||||
DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); | ||||||
TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); | ||||||
// When not folding the tail, we know that the induction increment will not | ||||||
// overflow. | ||||||
bool HasNUW = Style == TailFoldingStyle::None; | ||||||
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); | ||||||
addCanonicalIV(*Plan, Legal->getWidestInductionType(), DL); | ||||||
|
||||||
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); | ||||||
|
||||||
|
@@ -9279,11 +9280,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { | |||||
Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); | ||||||
Term->eraseFromParent(); | ||||||
|
||||||
// Tail folding is not supported for outer loops, so the induction increment | ||||||
// is guaranteed to not wrap. | ||||||
bool HasNUW = true; | ||||||
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, | ||||||
DebugLoc()); | ||||||
addCanonicalIV(*Plan, Legal->getWidestInductionType(), DebugLoc()); | ||||||
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); | ||||||
return Plan; | ||||||
} | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3122,7 +3122,8 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe { | |
|
||
VPCanonicalIVPHIRecipe *clone() override { | ||
auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc()); | ||
R->addOperand(getBackedgeValue()); | ||
if (getNumOperands() == 2) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems reasonable to expect every header phi recipe to have two operands (always) - implementing getStartValue() and getBackedgeValue() of VPHeaderPhiRecipe? |
||
R->addOperand(getBackedgeValue()); | ||
return R; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -1289,19 +1289,12 @@ void VPlanTransforms::optimize(VPlan &Plan) { | |||||
// %Negated = Not %ALM | ||||||
// branch-on-cond %Negated | ||||||
// | ||||||
static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( | ||||||
VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { | ||||||
static VPActiveLaneMaskPHIRecipe *createActiveLaneMaskPhi(VPlan &Plan) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Above documentation should be kept in sync. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated, thanks |
||||||
VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); | ||||||
VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); | ||||||
auto *CanonicalIVPHI = Plan.getCanonicalIV(); | ||||||
VPValue *StartV = CanonicalIVPHI->getStartValue(); | ||||||
|
||||||
auto *CanonicalIVIncrement = | ||||||
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue()); | ||||||
// TODO: Check if dropping the flags is needed if | ||||||
// !DataAndControlFlowWithoutRuntimeCheck. | ||||||
CanonicalIVIncrement->dropPoisonGeneratingFlags(); | ||||||
DebugLoc DL = CanonicalIVIncrement->getDebugLoc(); | ||||||
DebugLoc DL = CanonicalIVPHI->getDebugLoc(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is for an increment placed in the preheader, so ok to use the DL of the phi instead of that of the in-loop/backedged-value increment? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes I think so, the DL from the CanonicalIV should be the closest accurate debug location |
||||||
// We can't use StartV directly in the ActiveLaneMask VPInstruction, since | ||||||
// we have to take unrolling into account. Each part needs to start at | ||||||
// Part * VF | ||||||
|
@@ -1311,21 +1304,6 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( | |||||
// Create the ActiveLaneMask instruction using the correct start values. | ||||||
VPValue *TC = Plan.getTripCount(); | ||||||
|
||||||
VPValue *TripCount, *IncrementValue; | ||||||
if (!DataAndControlFlowWithoutRuntimeCheck) { | ||||||
// When the loop is guarded by a runtime overflow check for the loop | ||||||
// induction variable increment by VF, we can increment the value before | ||||||
// the get.active.lane mask and use the unmodified tripcount. | ||||||
IncrementValue = CanonicalIVIncrement; | ||||||
TripCount = TC; | ||||||
} else { | ||||||
// When avoiding a runtime check, the active.lane.mask inside the loop | ||||||
// uses a modified trip count and the induction variable increment is | ||||||
// done after the active.lane.mask intrinsic is called. | ||||||
IncrementValue = CanonicalIVPHI; | ||||||
TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF, | ||||||
{TC}, DL); | ||||||
} | ||||||
auto *EntryIncrement = Builder.createOverflowingOp( | ||||||
VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL, | ||||||
"index.part.next"); | ||||||
|
@@ -1339,24 +1317,6 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( | |||||
// preheader ActiveLaneMask instruction. | ||||||
auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); | ||||||
LaneMaskPhi->insertAfter(CanonicalIVPHI); | ||||||
|
||||||
// Create the active lane mask for the next iteration of the loop before the | ||||||
// original terminator. | ||||||
VPRecipeBase *OriginalTerminator = EB->getTerminator(); | ||||||
Builder.setInsertPoint(OriginalTerminator); | ||||||
auto *InLoopIncrement = | ||||||
Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, | ||||||
{IncrementValue}, {false, false}, DL); | ||||||
auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, | ||||||
{InLoopIncrement, TripCount}, DL, | ||||||
"active.lane.mask.next"); | ||||||
LaneMaskPhi->addOperand(ALM); | ||||||
|
||||||
// Replace the original terminator with BranchOnCond. We have to invert the | ||||||
// mask here because a true condition means jumping to the exit block. | ||||||
auto *NotMask = Builder.createNot(ALM, DL); | ||||||
Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL); | ||||||
OriginalTerminator->eraseFromParent(); | ||||||
return LaneMaskPhi; | ||||||
} | ||||||
|
||||||
|
@@ -1422,8 +1382,7 @@ void VPlanTransforms::addActiveLaneMask( | |||||
cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser); | ||||||
VPSingleDefRecipe *LaneMask; | ||||||
if (UseActiveLaneMaskForControlFlow) { | ||||||
LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( | ||||||
Plan, DataAndControlFlowWithoutRuntimeCheck); | ||||||
LaneMask = createActiveLaneMaskPhi(Plan); | ||||||
} else { | ||||||
VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); | ||||||
LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, | ||||||
|
@@ -1575,6 +1534,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength( | |||||
|
||||||
auto *CanonicalIVPHI = Plan.getCanonicalIV(); | ||||||
VPValue *StartV = CanonicalIVPHI->getStartValue(); | ||||||
VPBasicBlock *Latch = Plan.getVectorLoopRegion()->getExitingBasicBlock(); | ||||||
|
||||||
// Create the ExplicitVectorLengthPhi recipe in the main loop. | ||||||
auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); | ||||||
|
@@ -1593,30 +1553,26 @@ bool VPlanTransforms::tryAddExplicitVectorLength( | |||||
auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL, | ||||||
DebugLoc()); | ||||||
|
||||||
auto *CanonicalIVIncrement = | ||||||
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue()); | ||||||
VPSingleDefRecipe *OpVPEVL = VPEVL; | ||||||
VPRecipeBase *LatchTerm = Latch->getTerminator(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Term may be ambiguous. Spell out There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated, thanks |
||||||
if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits(); | ||||||
IVSize != 32) { | ||||||
OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc | ||||||
: Instruction::ZExt, | ||||||
OpVPEVL, CanonicalIVPHI->getScalarType()); | ||||||
OpVPEVL->insertBefore(CanonicalIVIncrement); | ||||||
OpVPEVL->insertBefore(LatchTerm); | ||||||
} | ||||||
auto *NextEVLIV = | ||||||
new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi}, | ||||||
{CanonicalIVIncrement->hasNoUnsignedWrap(), | ||||||
CanonicalIVIncrement->hasNoSignedWrap()}, | ||||||
CanonicalIVIncrement->getDebugLoc(), "index.evl.next"); | ||||||
NextEVLIV->insertBefore(CanonicalIVIncrement); | ||||||
new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi}, {false, false}, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wrap flags initiated to false may later be turned on? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes at this point we don't know if it wraps or not. Without #111758, the flags for EVL increment would always be false I think, as it always folds the tail. With it, we could update it when introducing the increment (currently not done) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Worth a comment and/or TODO? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added, thanks There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks. Possibly worth commenting what each |
||||||
CanonicalIVPHI->getDebugLoc(), "index.evl.next"); | ||||||
NextEVLIV->insertBefore(LatchTerm); | ||||||
EVLPhi->addOperand(NextEVLIV); | ||||||
|
||||||
transformRecipestoEVLRecipes(Plan, *VPEVL); | ||||||
|
||||||
// Replace all uses of VPCanonicalIVPHIRecipe by | ||||||
// VPEVLBasedIVPHIRecipe except for the canonical IV increment. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should all uses be replaced now except for BranchOnCount? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes updated, thanks |
||||||
// VPEVLBasedIVPHIRecipe. | ||||||
CanonicalIVPHI->replaceAllUsesWith(EVLPhi); | ||||||
CanonicalIVIncrement->setOperand(0, CanonicalIVPHI); | ||||||
// TODO: support unroll factor > 1. | ||||||
Plan.setUF(1); | ||||||
return true; | ||||||
|
@@ -1794,3 +1750,76 @@ void VPlanTransforms::createInterleaveGroups( | |||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
void VPlanTransforms::lowerCanonicalIV( | ||||||
VPlan &Plan, bool HasNUW, bool DataAndControlFlowWithoutRuntimeCheck) { | ||||||
auto *CanIV = Plan.getCanonicalIV(); | ||||||
|
||||||
VPBasicBlock *EB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated, thanks |
||||||
auto *Term = EB->getTerminator(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated, thanks |
||||||
VPBuilder Builder(Term); | ||||||
DebugLoc DL = CanIV->getDebugLoc(); | ||||||
// Add a VPInstruction to increment the scalar canonical IV by VF * UF. | ||||||
auto *CanonicalIVIncrement = | ||||||
Builder.createOverflowingOp(Instruction::Add, {CanIV, &Plan.getVFxUF()}, | ||||||
{HasNUW, false}, DL, "index.next"); | ||||||
|
||||||
CanIV->addOperand(CanonicalIVIncrement); | ||||||
|
||||||
auto FoundLaneMaskPhi = find_if( | ||||||
Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), | ||||||
[](VPRecipeBase &P) { return isa<VPActiveLaneMaskPHIRecipe>(P); }); | ||||||
|
||||||
if (FoundLaneMaskPhi == | ||||||
Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis().end()) { | ||||||
|
||||||
// Update BranchOnCount VPInstruction in the latch to use increment. | ||||||
// TODO: Should have separate opcodes for separate semantics. | ||||||
Term->setOperand(0, CanonicalIVIncrement); | ||||||
return; | ||||||
} | ||||||
|
||||||
// Now introduce a conditional branch to control the loop until the lane mask | ||||||
// is exhuasted. | ||||||
auto *LaneMaskPhi = cast<VPActiveLaneMaskPHIRecipe>(&*FoundLaneMaskPhi); | ||||||
auto *VecPreheader = | ||||||
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()); | ||||||
Builder.setInsertPoint(VecPreheader); | ||||||
|
||||||
VPValue *TC = Plan.getTripCount(); | ||||||
|
||||||
// TODO: Check if dropping the flags is needed if | ||||||
// !DataAndControlFlowWithoutRuntimeCheck. | ||||||
CanonicalIVIncrement->dropPoisonGeneratingFlags(); | ||||||
VPValue *TripCount, *IncrementValue; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is either the pre-incremented or the post-incremented IV
Suggested change
("IncrementValue" itself is VF*UF, "TripCount" may be TripCount-VF(*UF)) |
||||||
if (!DataAndControlFlowWithoutRuntimeCheck) { | ||||||
// When the loop is guarded by a runtime overflow check for the loop | ||||||
// induction variable increment by VF, we can increment the value before | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
? |
||||||
// the get.active.lane mask and use the unmodified tripcount. | ||||||
Comment on lines
+2050
to
+2052
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Worth clarifying the distinction between HasNUW and DataAndControlFlowWithoutRuntimeCheck - the former (also) seems to imply/suffice for the condition here? |
||||||
IncrementValue = CanonicalIVIncrement; | ||||||
TripCount = TC; | ||||||
} else { | ||||||
// When avoiding a runtime check, the active.lane.mask inside the loop | ||||||
// uses a modified trip count and the induction variable increment is | ||||||
// done after the active.lane.mask intrinsic is called. | ||||||
IncrementValue = CanIV; | ||||||
TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (independent) Should be |
||||||
{TC}, DL); | ||||||
} | ||||||
// Create the active lane mask for the next iteration of the loop before the | ||||||
// original terminator. | ||||||
Builder.setInsertPoint(EB); | ||||||
auto *InLoopIncrement = | ||||||
Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, | ||||||
{IncrementValue}, {false, false}, DL); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Worth clarifying the distinction between CanonicalIVIncrement and CanonicalIVIncrementForPart: the former is the scalar value CanIV + VF * UF which is uniform across VF * UF and belongs to the scalar canonical IV chain, whereas the latter is the scalar value CanIV + (VF * UF) + P * UF which is uniform across VF and uses the canonical IV chain (either pre or post increment), as in "build scalar steps" per part P. In particular, the latter is useful only for UF>1, i.e., when unrolling. Right? |
||||||
auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, | ||||||
{InLoopIncrement, TripCount}, DL, | ||||||
"active.lane.mask.next"); | ||||||
LaneMaskPhi->addOperand(ALM); | ||||||
|
||||||
// Replace the original terminator with BranchOnCond. We have to invert the | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This now introduces the terminal rather than replace it. Perhaps a terminal branch-on-count recipe should be introduced along with the abstract canonical IV (could conceptually check the pre-bumped IV with TC-step), delaying only the introduction of the canonical IV's increment between them for later? The canonical IV still remains abstract until this increment is added, but the VPlan continues to be "valid" w/o updating verify(). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated to initially still introduce the branch early. At the moment it still uses the same opcode, but would probably need to introduce a separate one for the different semantics (or define them conditionally on whether lowering has been finalized) |
||||||
// mask here because a true condition means jumping to the exit block. | ||||||
auto *NotMask = Builder.createNot(ALM, DL); | ||||||
Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BranchOnCond operates on a single condition bit, so NotMask (only last part thereof?) should effectively be reduced using AnyOf reduction - although its execute/generate seems to look for the first lane (only), rather than the last? |
||||||
Term->eraseFromParent(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The existing terminator is already a branch on cond; reset its operand? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current version leaves the BranchOnCount until we replace it here. |
||||||
} |
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -123,6 +123,11 @@ struct VPlanTransforms { | |||||||
|
||||||||
/// Remove dead recipes from \p Plan. | ||||||||
static void removeDeadRecipes(VPlan &Plan); | ||||||||
|
||||||||
/// Finalize \p Plan by introducing explicit increments for the canonical | ||||||||
/// induction. | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
this is one step in getting Plan to be ready for code-gen, not necessarily the last one after which Plan has been "Finalized". There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point, we need to do this before unrolling, adjusted, thanks! |
||||||||
static void lowerCanonicalIV(VPlan &Plan, bool HasNUW, | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated, thanks |
||||||||
bool DataAndControlFlowWithoutRuntimeCheck); | ||||||||
}; | ||||||||
|
||||||||
} // namespace llvm | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -91,7 +91,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { | |
|
||
auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV(); | ||
// Canonical IV chain is uniform. | ||
if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue()) | ||
if (V == CanonicalIV) // || V == CanonicalIV->getBackedgeValue()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Continue to consider the backedge value uniform, if it exists? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done thanks! |
||
return true; | ||
|
||
return TypeSwitch<const VPRecipeBase *, bool>(R) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What does this comment refer to, should it move?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It refers to the VPlan transforms below, I moved down and clarified.