diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index da99e8d51efdfc..27a1d7249a9f40 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -541,7 +541,7 @@ enum BasicBlockFlags : unsigned __int64 // Flags to update when two blocks are compacted BBF_COMPACT_UPD = BBF_GC_SAFE_POINT | BBF_HAS_JMP | BBF_HAS_IDX_LEN | BBF_HAS_MD_IDX_LEN | BBF_BACKWARD_JUMP | \ - BBF_HAS_NEWOBJ | BBF_HAS_NULLCHECK | BBF_HAS_MDARRAYREF, + BBF_HAS_NEWOBJ | BBF_HAS_NULLCHECK | BBF_HAS_MDARRAYREF | BBF_LOOP_PREHEADER, // Flags a block should not have had before it is split. diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index fe398e51fbdc34..b9199d49b646cc 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -5003,8 +5003,9 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl // Dominator and reachability sets are no longer valid. // The loop table is no longer valid. - fgDomsComputed = false; - optLoopTableValid = false; + fgDomsComputed = false; + optLoopTableValid = false; + optLoopsRequirePreHeaders = false; #ifdef DEBUG DoPhase(this, PHASE_STRESS_SPLIT_TREE, &Compiler::StressSplitTree); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index e21307de79e919..b227a344410f72 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -5232,7 +5232,7 @@ class Compiler void vnPrint(ValueNum vn, unsigned level); #endif - bool fgDominate(BasicBlock* b1, BasicBlock* b2); // Return true if b1 dominates b2 + bool fgDominate(const BasicBlock* b1, const BasicBlock* b2); // Return true if b1 dominates b2 // Dominator computation member functions // Not exposed outside Compiler @@ -6193,7 +6193,7 @@ class Compiler bool optHoistLoopNest(unsigned lnum, LoopHoistContext* hoistCtxt); // Do hoisting for a particular loop ("lnum" is an index into the optLoopTable.) - bool optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, BasicBlockList* existingPreHeaders); + bool optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt); // Hoist all expressions in "blocks" that are invariant in loop "loopNum" (an index into the optLoopTable) // outside of that loop. @@ -6262,7 +6262,6 @@ class Compiler PhaseStatus optCloneLoops(); void optCloneLoop(unsigned loopInd, LoopCloneContext* context); - void optEnsureUniqueHead(unsigned loopInd, weight_t ambientWeight); PhaseStatus optUnrollLoops(); // Unrolls loops (needs to have cost info) void optRemoveRedundantZeroInits(); PhaseStatus optIfConversion(); // If conversion @@ -6339,8 +6338,6 @@ class Compiler int lpLoopVarFPCount; // The register count for the FP LclVars that are read/written inside this loop int lpVarInOutFPCount; // The register count for the FP LclVars that are alive inside or across this loop - bool lpHoistAddedPreheader; // The loop preheader was added during hoisting - typedef JitHashTable, FieldKindForVN> FieldHandleSet; FieldHandleSet* lpFieldsModified; // This has entries for all static field and object instance fields modified @@ -6512,10 +6509,11 @@ class Compiler bool fgHasLoops; // True if this method has any loops, set in fgComputeReachability public: - LoopDsc* optLoopTable; // loop descriptor table - bool optLoopTableValid; // info in loop table should be valid - unsigned char optLoopCount; // number of tracked loops - unsigned char loopAlignCandidates; // number of loops identified for alignment + LoopDsc* optLoopTable; // loop descriptor table + bool optLoopTableValid; // info in loop table should be valid + bool optLoopsRequirePreHeaders; // Do we require that all loops (in the loop table) have pre-headers? + unsigned char optLoopCount; // number of tracked loops + unsigned char loopAlignCandidates; // number of loops identified for alignment // Every time we rebuild the loop table, we increase the global "loop epoch". Any loop indices or // loop table pointers from the previous epoch are invalid. @@ -6575,7 +6573,7 @@ class Compiler bool optComputeIterInfo(GenTree* incr, BasicBlock* from, BasicBlock* to, unsigned* pIterVar); bool optPopulateInitInfo(unsigned loopInd, BasicBlock* initBlock, GenTree* init, unsigned iterVar); bool optExtractInitTestIncr( - BasicBlock* head, BasicBlock* bottom, BasicBlock* exit, GenTree** ppInit, GenTree** ppTest, GenTree** ppIncr); + BasicBlock** pInitBlock, BasicBlock* bottom, BasicBlock* exit, GenTree** ppInit, GenTree** ppTest, GenTree** ppIncr); void optFindNaturalLoops(); @@ -6642,7 +6640,7 @@ class Compiler bool optIsLoopEntry(BasicBlock* block) const; // The depth of the loop described by "lnum" (an index into the loop table.) (0 == top level) - unsigned optLoopDepth(unsigned lnum) + unsigned optLoopDepth(unsigned lnum) const { assert(lnum < optLoopCount); unsigned depth = 0; @@ -7650,7 +7648,7 @@ class Compiler bool optCheckLoopCloningGDVTestProfitable(GenTreeOp* guard, LoopCloneVisitorInfo* info); bool optIsHandleOrIndirOfHandle(GenTree* tree, GenTreeFlags handleType); - bool optLoopCloningEnabled(); + static bool optLoopCloningEnabled(); #ifdef DEBUG void optDebugLogLoopCloning(BasicBlock* block, Statement* insertBefore); diff --git a/src/coreclr/jit/fgbasic.cpp b/src/coreclr/jit/fgbasic.cpp index fe1f74c8b45138..868c02d0faf5ce 100644 --- a/src/coreclr/jit/fgbasic.cpp +++ b/src/coreclr/jit/fgbasic.cpp @@ -4971,6 +4971,8 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable) JITDUMP("fgRemoveBlock " FMT_BB ", unreachable=%s\n", block->bbNum, dspBool(unreachable)); + assert(unreachable || !optLoopsRequirePreHeaders || ((block->bbFlags & BBF_LOOP_PREHEADER) == 0)); + // If we've cached any mappings from switch blocks to SwitchDesc's (which contain only the // *unique* successors of the switch block), invalidate that cache, since an entry in one of // the SwitchDescs might be removed. diff --git a/src/coreclr/jit/fgdiagnostic.cpp b/src/coreclr/jit/fgdiagnostic.cpp index 176f10bc8cac1f..8a36ae86c9e941 100644 --- a/src/coreclr/jit/fgdiagnostic.cpp +++ b/src/coreclr/jit/fgdiagnostic.cpp @@ -4335,6 +4335,7 @@ void Compiler::fgDebugCheckSsa() // - All basic blocks with loop numbers set have a corresponding loop in the table // - All basic blocks without a loop number are not in a loop // - All parents of the loop with the block contain that block +// - If optLoopsRequirePreHeaders is true, the loop has a pre-header // - If the loop has a pre-header, it is valid // - The loop flags are valid // - no loop shares `top` with any of its children @@ -4593,6 +4594,11 @@ void Compiler::fgDebugCheckLoopTable() } } + if (optLoopsRequirePreHeaders) + { + assert((loop.lpFlags & LPFLG_HAS_PREHEAD) != 0); + } + // If the loop has a pre-header, ensure the pre-header form is correct. if ((loop.lpFlags & LPFLG_HAS_PREHEAD) != 0) { diff --git a/src/coreclr/jit/fgflow.cpp b/src/coreclr/jit/fgflow.cpp index cf91ff3d33b280..5f265ea783a8cf 100644 --- a/src/coreclr/jit/fgflow.cpp +++ b/src/coreclr/jit/fgflow.cpp @@ -361,27 +361,20 @@ void Compiler::fgRemoveBlockAsPred(BasicBlock* block) fgRemoveRefPred(bNext, bNext->bbPreds->getSourceBlock()); } } + fgRemoveRefPred(block->bbJumpDest, block); + break; - FALLTHROUGH; - - case BBJ_COND: case BBJ_ALWAYS: case BBJ_EHCATCHRET: - - /* Update the predecessor list for 'block->bbJumpDest' and 'block->bbNext' */ fgRemoveRefPred(block->bbJumpDest, block); - - if (block->bbJumpKind != BBJ_COND) - { - break; - } - - /* If BBJ_COND fall through */ - FALLTHROUGH; + break; case BBJ_NONE: + fgRemoveRefPred(block->bbNext, block); + break; - /* Update the predecessor list for 'block->bbNext' */ + case BBJ_COND: + fgRemoveRefPred(block->bbJumpDest, block); fgRemoveRefPred(block->bbNext, block); break; diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index f6cc772cd204a4..84e6ce0ef21d76 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -26,7 +26,7 @@ // Assumptions: // -- Dominators have been calculated (`fgDomsComputed` is true). // -bool Compiler::fgDominate(BasicBlock* b1, BasicBlock* b2) +bool Compiler::fgDominate(const BasicBlock* b1, const BasicBlock* b2) { noway_assert(fgDomsComputed); @@ -60,17 +60,6 @@ bool Compiler::fgDominate(BasicBlock* b1, BasicBlock* b2) if (b1->bbNum > fgDomBBcount) { - // If b1 is a loop preheader (that was created after the dominators were calculated), - // then it has a single successor that is the loop entry, and it is the only non-loop - // predecessor of the loop entry. Thus, b1 dominates the loop entry and also dominates - // what the loop entry dominates. - if (b1->bbFlags & BBF_LOOP_PREHEADER) - { - BasicBlock* loopEntry = b1->GetUniqueSucc(); - assert(loopEntry != nullptr); - return fgDominate(loopEntry, b2); - } - // unknown dominators; err on the safe side and return false return false; } @@ -1947,6 +1936,17 @@ bool Compiler::fgCanCompactBlocks(BasicBlock* block, BasicBlock* bNext) return false; } + // Don't allow removing an empty loop pre-header. + // We can compact a pre-header `bNext` into an empty `block` since BBF_COMPACT_UPD propagates + // BBF_LOOP_PREHEADER to `block`. + if (optLoopsRequirePreHeaders) + { + if (((block->bbFlags & BBF_LOOP_PREHEADER) != 0) && (bNext->countOfInEdges() != 1)) + { + return false; + } + } + // Don't compact the first block if it was specially created as a scratch block. if (fgBBisScratch(block)) { @@ -1983,16 +1983,14 @@ bool Compiler::fgCanCompactBlocks(BasicBlock* block, BasicBlock* bNext) } } - // We cannot compact a block that participates in loop - // alignment. + // We cannot compact a block that participates in loop alignment. // if ((bNext->countOfInEdges() > 1) && bNext->isLoopAlign()) { return false; } - // If we are trying to compact blocks from different loops - // that don't do it. + // Don't compact blocks from different loops. // if ((block->bbNatLoopNum != BasicBlock::NOT_IN_LOOP) && (bNext->bbNatLoopNum != BasicBlock::NOT_IN_LOOP) && (block->bbNatLoopNum != bNext->bbNatLoopNum)) @@ -2058,8 +2056,13 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext) JITDUMP("Second block has %u other incoming edges\n", bNext->countOfInEdges()); assert(block->isEmpty()); - // `block` can no longer be a loop pre-header (if it was before). + // When loops require pre-headers, `block` cannot be a pre-header. + // We should have screened this out in fgCanCompactBlocks(). + // + // When pre-headers are not required, then if `block` was a pre-header, + // it no longer is. // + assert(!optLoopsRequirePreHeaders || ((block->bbFlags & BBF_LOOP_PREHEADER) == 0)); block->bbFlags &= ~BBF_LOOP_PREHEADER; // Retarget all the other edges incident on bNext. Do this @@ -3053,6 +3056,15 @@ bool Compiler::fgOptimizeEmptyBlock(BasicBlock* block) break; } + // Don't delete empty loop pre-headers. + if (optLoopsRequirePreHeaders) + { + if ((block->bbFlags & BBF_LOOP_PREHEADER) != 0) + { + break; + } + } + /* special case if this is the last BB */ if (block == fgLastBB) { @@ -6027,8 +6039,9 @@ PhaseStatus Compiler::fgUpdateFlowGraphPhase() // Dominator and reachability sets are no longer valid. // The loop table is no longer valid. - fgDomsComputed = false; - optLoopTableValid = false; + fgDomsComputed = false; + optLoopTableValid = false; + optLoopsRequirePreHeaders = false; return madeChanges ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING; } diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp index d6acb100e62580..0f28c613470919 100644 --- a/src/coreclr/jit/loopcloning.cpp +++ b/src/coreclr/jit/loopcloning.cpp @@ -1970,81 +1970,6 @@ BasicBlock* Compiler::optInsertLoopChoiceConditions(LoopCloneContext* context, return insertAfter; } -//------------------------------------------------------------------------ -// OptEnsureUniqueHead: Ensure that loop "loopInd" has a unique head block. -// If the existing entry has non-loop predecessors other than the head entry, -// create a new, empty block that goes (only) to the entry, and redirects the -// preds of the entry to this new block. Sets the weight of the newly created -// block to "ambientWeight". -// -// NOTE: this is currently dead code, because it is only called by loop cloning, -// and loop cloning only works with single-entry loops where the immediately -// preceding head block is the only predecessor of the loop entry. -// -// Arguments: -// loopInd - index of loop to process -// ambientWeight - weight to give the new head, if created. -// -void Compiler::optEnsureUniqueHead(unsigned loopInd, weight_t ambientWeight) -{ - LoopDsc& loop = optLoopTable[loopInd]; - - BasicBlock* h = loop.lpHead; - BasicBlock* t = loop.lpTop; - BasicBlock* e = loop.lpEntry; - BasicBlock* b = loop.lpBottom; - - // If "h" dominates the entry block, then it is the unique header. - if (fgDominate(h, e)) - { - return; - } - - // Otherwise, create a new empty header block, make it the pred of the entry block, - // and redirect the preds of the entry block to go to this. - - BasicBlock* beforeTop = t->bbPrev; - assert(!beforeTop->bbFallsThrough() || (beforeTop->bbNext == e)); - - // Make sure that the new block is in the same region as the loop. - // (We will only create loops that are entirely within a region.) - BasicBlock* h2 = fgNewBBafter(BBJ_NONE, beforeTop, /*extendRegion*/ true); - assert(beforeTop->bbNext == h2); - - // This is in the containing loop. - h2->bbNatLoopNum = loop.lpParent; - h2->bbWeight = h2->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight; - - if (h2->bbNext != e) - { - h2->bbJumpKind = BBJ_ALWAYS; - h2->bbJumpDest = e; - } - BlockSetOps::Assign(this, h2->bbReach, e->bbReach); - - fgAddRefPred(e, h2); - - // Redirect paths from preds of "e" to go to "h2" instead of "e". - BlockToBlockMap* blockMap = new (getAllocator(CMK_LoopClone)) BlockToBlockMap(getAllocator(CMK_LoopClone)); - blockMap->Set(e, h2); - - for (BasicBlock* const predBlock : e->PredBlocks()) - { - // Skip if predBlock is in the loop. - if (t->bbNum <= predBlock->bbNum && predBlock->bbNum <= b->bbNum) - { - continue; - } - - optRedirectBlock(predBlock, blockMap); - - fgAddRefPred(h2, predBlock); - fgRemoveRefPred(e, predBlock); - } - - optUpdateLoopHead(loopInd, h, h2); -} - //------------------------------------------------------------------------ // optCloneLoop: Perform the mechanical cloning of the specified loop // @@ -2087,12 +2012,9 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) // the loop being cloned. unsigned char ambientLoop = loop.lpParent; - // First, make sure that the loop has a unique header block, creating an empty one if necessary. - optEnsureUniqueHead(loopInd, ambientWeight); - // We're going to transform this loop: // - // H --> E (or, H conditionally branches around the loop and has fall-through to T == E) + // H --> E (if T == E, H falls through to T/E) // T // E // B ?-> T @@ -2101,11 +2023,11 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) // to this pair of loops: // // H ?-> H3 (all loop failure conditions branch to new slow path loop head) - // H2--> E (Optional; if T == E, let H fall through to T/E) + // H2 --> E (if T == E, H2 will fall through to T/E) // T // E // B ?-> T - // X2--> X + // X2 --> X (only need this if B -> is conditional, not BBJ_ALWAYS) // H3 --> E2 (aka slowHead. Or, H3 falls through to T2 == E2) // T2 // E2 @@ -2113,28 +2035,38 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) // X BasicBlock* h = loop.lpHead; - if (!h->KindIs(BBJ_NONE, BBJ_ALWAYS)) - { - // Make a new block to be the unique entry to the loop. - JITDUMP("Create new unique single-successor entry to loop\n"); - assert((h->bbJumpKind == BBJ_COND) && (h->bbNext == loop.lpEntry)); - BasicBlock* newH = fgNewBBafter(BBJ_NONE, h, /*extendRegion*/ true); - JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", newH->bbNum, h->bbNum); - newH->bbWeight = newH->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight; - BlockSetOps::Assign(this, newH->bbReach, h->bbReach); - // This is in the scope of a surrounding loop, if one exists -- the parent of the loop we're cloning. - newH->bbNatLoopNum = ambientLoop; - optUpdateLoopHead(loopInd, h, newH); + assert((h->bbFlags & BBF_LOOP_PREHEADER) != 0); - fgAddRefPred(newH, h); // Add h->newH pred edge - JITDUMP("Adding " FMT_BB " -> " FMT_BB "\n", h->bbNum, newH->bbNum); - fgReplacePred(newH->bbNext, h, newH); // Replace pred in COND fall-through block. - JITDUMP("Replace " FMT_BB " -> " FMT_BB " with " FMT_BB " -> " FMT_BB "\n", h->bbNum, newH->bbNext->bbNum, - newH->bbNum, newH->bbNext->bbNum); + // Make a new pre-header block 'h2' for the loop. 'h' will fall through to 'h2'. + JITDUMP("Create new header block for loop\n"); - h = newH; + BasicBlock* h2 = fgNewBBafter(BBJ_NONE, h, /*extendRegion*/ true); + JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", h2->bbNum, h->bbNum); + h2->bbWeight = h2->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight; + h2->bbNatLoopNum = ambientLoop; + h2->bbFlags |= BBF_LOOP_PREHEADER; + + if (h->bbJumpKind != BBJ_NONE) + { + assert(h->bbJumpKind == BBJ_ALWAYS); + assert(h->bbJumpDest == loop.lpEntry); + h2->bbJumpKind = BBJ_ALWAYS; + h2->bbJumpDest = loop.lpEntry; } - assert(h == loop.lpHead); + + fgReplacePred(loop.lpEntry, h, h2); + JITDUMP("Replace " FMT_BB " -> " FMT_BB " with " FMT_BB " -> " FMT_BB "\n", h->bbNum, loop.lpEntry->bbNum, + h2->bbNum, loop.lpEntry->bbNum); + + // 'h' is no longer the loop head; 'h2' is! + h->bbFlags &= ~BBF_LOOP_PREHEADER; + optUpdateLoopHead(loopInd, h, h2); + + // Make 'h' fall through to 'h2' (if it didn't already). + // Don't add the h->h2 edge because we're going to insert the cloning conditions between 'h' and 'h2', and + // optInsertLoopChoiceConditions() will add the edge. + h->bbJumpKind = BBJ_NONE; + h->bbJumpDest = nullptr; // Make X2 after B, if necessary. (Not necessary if B is a BBJ_ALWAYS.) // "newPred" will be the predecessor of the blocks of the cloned loop. @@ -2168,41 +2100,21 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) } } - // We're going to create a new loop head for the slow loop immediately before the slow loop itself. All failed - // conditions will branch to the slow head. The slow head will either fall through to the entry, or unconditionally - // branch to the slow path entry. This puts the slow loop in the canonical loop form. - BasicBlock* slowHeadPrev = newPred; - - // Now we'll make "h2", after "h" to go to "e" -- unless the loop is a do-while, - // so that "h" already falls through to "e" (e == t). - // It might look like this code is unreachable, since "h" must be a BBJ_ALWAYS, but - // later we will change "h" to a BBJ_COND along with a set of loop conditions. - // Cloning is currently restricted to "do-while" loop forms, where this case won't occur. - // However, it can occur in OSR methods. - BasicBlock* h2 = nullptr; - if (h->bbNext != loop.lpEntry) - { - assert(h->bbJumpKind == BBJ_ALWAYS); - JITDUMP("Create branch to entry of optimized loop\n"); - h2 = fgNewBBafter(BBJ_ALWAYS, h, /*extendRegion*/ true); - JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", h2->bbNum, h->bbNum); - h2->bbWeight = h2->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight; - - // This is in the scope of a surrounding loop, if one exists -- the parent of the loop we're cloning. - h2->bbNatLoopNum = ambientLoop; - - h2->bbJumpDest = loop.lpEntry; - fgReplacePred(loop.lpEntry, h, h2); - JITDUMP("Replace " FMT_BB " -> " FMT_BB " with " FMT_BB " -> " FMT_BB "\n", h->bbNum, loop.lpEntry->bbNum, - h2->bbNum, loop.lpEntry->bbNum); - - optUpdateLoopHead(loopInd, h, h2); - - // NOTE: 'h' is no longer the loop head; 'h2' is! - } + // Create a new loop head for the slow loop immediately before the slow loop itself. All failed + // conditions will branch to the slow head. The slow head will either fall through or unconditionally + // branch to the slow path entry. The slowHead will be a loop pre-header, however we don't put the slow loop + // in the loop table, so it isn't marked as a pre-header (if we re-built the loop table after cloning, slowHead + // would end up in the loop table as a pre-header without creating any new pre-header block). This puts the + // slow loop in the canonical loop form. + JITDUMP("Create unique head block for slow path loop\n"); + BasicBlock* slowHead = fgNewBBafter(BBJ_NONE, newPred, /*extendRegion*/ true); + JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", slowHead->bbNum, newPred->bbNum); + slowHead->bbWeight = newPred->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight; + slowHead->scaleBBWeight(slowPathWeightScaleFactor); + slowHead->bbNatLoopNum = ambientLoop; + newPred = slowHead; // Now we'll clone the blocks of the loop body. These cloned blocks will be the slow path. - BasicBlock* newFirst = nullptr; BlockToBlockMap* blockMap = new (getAllocator(CMK_LoopClone)) BlockToBlockMap(getAllocator(CMK_LoopClone)); for (BasicBlock* const blk : loop.LoopBlocks()) @@ -2229,22 +2141,28 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) // If the original loop is aligned, do not align the cloned loop because cloned loop will be executed in // rare scenario. Additionally, having to align cloned loop will force us to disable some VEX prefix encoding // and adding compensation for over-estimated instructions. - if (blk->isLoopAlign()) + if (newBlk->isLoopAlign()) { newBlk->bbFlags &= ~BBF_LOOP_ALIGN; JITDUMP("Removing LOOP_ALIGN flag from cloned loop in " FMT_BB "\n", newBlk->bbNum); } #endif + // If the loop we're cloning contains nested loops, we need to clear the pre-header bit on + // any nested loop pre-header blocks, since they will no longer be loop pre-headers. (This is because + // we don't add the slow loop or its child loops to the loop table. It would be simplest to + // just re-build the loop table if we want to enable loop optimization of the slow path loops.) + if ((newBlk->bbFlags & BBF_LOOP_PREHEADER) != 0) + { + JITDUMP("Removing BBF_LOOP_PREHEADER flag from nested cloned loop block " FMT_BB "\n", newBlk->bbNum); + newBlk->bbFlags &= ~BBF_LOOP_PREHEADER; + } + // TODO-Cleanup: The above clones the bbNatLoopNum, which is incorrect. Eventually, we should probably insert // the cloned loop in the loop table. For now, however, we'll just make these blocks be part of the surrounding // loop, if one exists -- the parent of the loop we're cloning. newBlk->bbNatLoopNum = loop.lpParent; - if (newFirst == nullptr) - { - newFirst = newBlk; - } newPred = newBlk; blockMap->Set(blk, newBlk); } @@ -2321,44 +2239,25 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) // !cond1 -?> slowHead // ... // !condn -?> slowHead - // h2/entry (fast) + // h2 -?> e (fast loop pre-header branch or fall-through to e) // ... - // slowHead -?> e2 (slowHead) branch or fall-through to e2 + // slowHead -?> e2 (slow loop pre-header branch or fall-through to e2) // // We should always have block conditions. assert(context->HasBlockConditions(loopInd)); - - if (h->bbJumpKind == BBJ_NONE) - { - assert(h->bbNext == loop.lpEntry); - fgRemoveRefPred(h->bbNext, h); - } - else - { - assert(h->bbJumpKind == BBJ_ALWAYS); - assert(h->bbJumpDest == loop.lpEntry); - assert(h2 != nullptr); - h->bbJumpKind = BBJ_NONE; - h->bbJumpDest = nullptr; - } + assert(h->bbJumpKind == BBJ_NONE); + assert(h->bbNext == h2); // If any condition is false, go to slowHead (which branches or falls through to e2). BasicBlock* e2 = nullptr; bool foundIt = blockMap->Lookup(loop.lpEntry, &e2); assert(foundIt && e2 != nullptr); - // Create a unique header for the slow path. - JITDUMP("Create unique head block for slow path loop\n"); - BasicBlock* slowHead = fgNewBBafter(BBJ_NONE, slowHeadPrev, /*extendRegion*/ true); - JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", slowHead->bbNum, slowHeadPrev->bbNum); - slowHead->bbWeight = slowHeadPrev->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight; - slowHead->scaleBBWeight(slowPathWeightScaleFactor); - slowHead->bbNatLoopNum = ambientLoop; - if (slowHead->bbNext != e2) { // We can't just fall through to the slow path entry, so make it an unconditional branch. + assert(slowHead->bbJumpKind == BBJ_NONE); // This is how we created it above. slowHead->bbJumpKind = BBJ_ALWAYS; slowHead->bbJumpDest = e2; } @@ -2374,12 +2273,6 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) JITDUMP("Adding " FMT_BB " -> " FMT_BB "\n", condLast->bbNum, condLast->bbNext->bbNum); fgAddRefPred(condLast->bbNext, condLast); - // If h2 is present it is already the head. Else, replace 'h' as the loop head by 'condLast'. - if (h2 == nullptr) - { - optUpdateLoopHead(loopInd, loop.lpHead, condLast); - } - // Don't unroll loops that we've cloned -- the unroller expects any loop it should unroll to // initialize the loop counter immediately before entering the loop, but we've left a shared // initialization of the loop counter up above the test that determines which version of the @@ -3172,6 +3065,7 @@ bool Compiler::optObtainLoopCloningOpts(LoopCloneContext* context) // Return Value: // true if loop cloning is allowed, false if disallowed. // +// static bool Compiler::optLoopCloningEnabled() { #ifdef DEBUG diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index c1ded572990688..96790a373513c8 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -23,10 +23,11 @@ void Compiler::optInit() loopAlignCandidates = 0; /* Initialize the # of tracked loops to 0 */ - optLoopCount = 0; - optLoopTable = nullptr; - optLoopTableValid = false; - optCurLoopEpoch = 0; + optLoopCount = 0; + optLoopTable = nullptr; + optLoopTableValid = false; + optLoopsRequirePreHeaders = false; + optCurLoopEpoch = 0; #ifdef DEBUG loopsAligned = 0; @@ -807,15 +808,32 @@ bool Compiler::optPopulateInitInfo(unsigned loopInd, BasicBlock* initBlock, GenT return false; } - // We found an initializer in the `head` block. For this to be used, we need to make sure the + // We found an initializer in the `initBlock` block. For this to be used, we need to make sure the // "iterVar" initialization is never skipped. That is, every pred of ENTRY other than HEAD is in the loop. + // We allow one special case: the HEAD block is an empty predecessor to ENTRY, and the initBlock is the + // only predecessor to HEAD. This handles the case where we rebuild the loop table (after inserting + // pre-headers) and we still want to find the initializer before the pre-header block. for (BasicBlock* const predBlock : optLoopTable[loopInd].lpEntry->PredBlocks()) { - if ((predBlock != initBlock) && !optLoopTable[loopInd].lpContains(predBlock)) + if (!optLoopTable[loopInd].lpContains(predBlock)) { - JITDUMP(FMT_LP ": initialization not guaranteed through `head` block; ignore constant initializer\n", - loopInd); - return false; + bool initBlockOk = (predBlock == initBlock); + if (!initBlockOk) + { + if ((predBlock->bbJumpKind == BBJ_NONE) && (predBlock->bbNext == optLoopTable[loopInd].lpEntry) && + (predBlock->countOfInEdges() == 1) && (predBlock->firstStmt() == nullptr) && + (predBlock->bbPrev != nullptr) && predBlock->bbPrev->bbFallsThrough()) + { + initBlockOk = true; + } + } + if (!initBlockOk) + { + JITDUMP(FMT_LP ": initialization not guaranteed from " FMT_BB " through to entry block " FMT_BB + " from pred " FMT_BB "; ignore constant initializer\n", + loopInd, initBlock->bbNum, optLoopTable[loopInd].lpEntry->bbNum, predBlock->bbNum); + return false; + } } } @@ -1131,12 +1149,13 @@ bool Compiler::optIsLoopTestEvalIntoTemp(Statement* testStmt, Statement** newTes // Extract the "init", "test" and "incr" nodes of the loop. // // Arguments: -// head - Loop head block -// bottom - Loop bottom block -// top - Loop top block -// ppInit - The init stmt of the loop if found. -// ppTest - The test stmt of the loop if found. -// ppIncr - The incr stmt of the loop if found. +// pInitBlock - [IN/OUT] *pInitBlock is the loop head block on entry, and is set to the initBlock on exit, +// if `**ppInit` is non-null. +// bottom - Loop bottom block +// top - Loop top block +// ppInit - The init stmt of the loop if found. +// ppTest - The test stmt of the loop if found. +// ppIncr - The incr stmt of the loop if found. // // Return Value: // The results are put in "ppInit", "ppTest" and "ppIncr" if the method @@ -1145,9 +1164,8 @@ bool Compiler::optIsLoopTestEvalIntoTemp(Statement* testStmt, Statement** newTes // to nullptr. Return value will never be false if `init` is not found. // // Operation: -// Check if the "test" stmt is last stmt in the loop "bottom". If found good, -// "test" stmt is found. Try to find the "incr" stmt. Check previous stmt of -// "test" to get the "incr" stmt. If it is not found it could be a loop of the +// Check if the "test" stmt is last stmt in the loop "bottom". Try to find the "incr" stmt. +// Check previous stmt of "test" to get the "incr" stmt. If it is not found it could be a loop of the // below form. // // +-------<-----------------<-----------+ @@ -1164,8 +1182,9 @@ bool Compiler::optIsLoopTestEvalIntoTemp(Statement* testStmt, Statement** newTes // the callers are expected to verify that "iterVar" is used in the test. // bool Compiler::optExtractInitTestIncr( - BasicBlock* head, BasicBlock* bottom, BasicBlock* top, GenTree** ppInit, GenTree** ppTest, GenTree** ppIncr) + BasicBlock** pInitBlock, BasicBlock* bottom, BasicBlock* top, GenTree** ppInit, GenTree** ppTest, GenTree** ppIncr) { + assert(pInitBlock != nullptr); assert(ppInit != nullptr); assert(ppTest != nullptr); assert(ppIncr != nullptr); @@ -1217,7 +1236,22 @@ bool Compiler::optExtractInitTestIncr( // Find the last statement in the loop pre-header which we expect to be the initialization of // the loop iterator. - Statement* phdrStmt = head->firstStmt(); + BasicBlock* initBlock = *pInitBlock; + Statement* phdrStmt = initBlock->firstStmt(); + if (phdrStmt == nullptr) + { + // When we build the loop table, we canonicalize by introducing loop pre-headers for all loops. + // If we are rebuilding the loop table, we would already have the pre-header block introduced + // the first time, which might be empty if no hoisting has yet occurred. In this case, look a + // little harder for the possible loop initialization statement. + if ((initBlock->bbJumpKind == BBJ_NONE) && (initBlock->bbNext == top) && (initBlock->countOfInEdges() == 1) && + (initBlock->bbPrev != nullptr) && initBlock->bbPrev->bbFallsThrough()) + { + initBlock = initBlock->bbPrev; + phdrStmt = initBlock->firstStmt(); + } + } + if (phdrStmt != nullptr) { Statement* initStmt = phdrStmt->GetPrevStmt(); @@ -1234,11 +1268,6 @@ bool Compiler::optExtractInitTestIncr( // statements other than duplicated loop conditions. doGetPrev = (initStmt->GetPrevStmt() != nullptr); } - else - { - // Must be a duplicated loop condition. - noway_assert(initStmt->GetRootNode()->gtOper == GT_JTRUE); - } #endif // DEBUG if (doGetPrev) { @@ -1247,7 +1276,8 @@ bool Compiler::optExtractInitTestIncr( noway_assert(initStmt != nullptr); } - *ppInit = initStmt->GetRootNode(); + *ppInit = initStmt->GetRootNode(); + *pInitBlock = initBlock; } else { @@ -1371,24 +1401,25 @@ bool Compiler::optRecordLoop( // if (bottom->bbJumpKind == BBJ_COND) { - GenTree* init; - GenTree* test; - GenTree* incr; - if (!optExtractInitTestIncr(head, bottom, top, &init, &test, &incr)) + GenTree* init; + GenTree* test; + GenTree* incr; + BasicBlock* initBlock = head; + if (!optExtractInitTestIncr(&initBlock, bottom, top, &init, &test, &incr)) { JITDUMP(FMT_LP ": couldn't find init/test/incr; not LPFLG_ITER loop\n", loopInd); goto DONE_LOOP; } unsigned iterVar = BAD_VAR_NUM; - if (!optComputeIterInfo(incr, head->bbNext, bottom, &iterVar)) + if (!optComputeIterInfo(incr, top, bottom, &iterVar)) { JITDUMP(FMT_LP ": increment expression not appropriate form, or not loop invariant; not LPFLG_ITER loop\n", loopInd); goto DONE_LOOP; } - optPopulateInitInfo(loopInd, head, init, iterVar); + optPopulateInitInfo(loopInd, initBlock, init, iterVar); // Check that the iterator is used in the loop condition. if (!optCheckIterInLoopTest(loopInd, test, iterVar)) @@ -2392,7 +2423,7 @@ class LoopSearch block->bbFlags |= BBF_KEEP_BBJ_ALWAYS; } - // If block is newNext's only predcessor, move the IR from block to newNext, + // If block is newNext's only predecessor, move the IR from block to newNext, // but keep the now-empty block around. // // We move the IR because loop recognition has a very limited search capability and @@ -2687,19 +2718,22 @@ void Compiler::optFindNaturalLoops() fgUpdateChangedFlowGraph(FlowGraphUpdates::COMPUTE_DOMS); } - if (false /* pre-header stress */) + // Create a loop pre-header for every loop. + bool modForPreHeader = false; + for (unsigned loopInd = 0; loopInd < optLoopCount; loopInd++) { - // Stress mode: aggressively create loop pre-header for every loop. - for (unsigned loopInd = 0; loopInd < optLoopCount; loopInd++) - { - fgCreateLoopPreHeader(loopInd); - } - - if (fgModified) + if (fgCreateLoopPreHeader(loopInd)) { - fgUpdateChangedFlowGraph(FlowGraphUpdates::COMPUTE_DOMS); + modForPreHeader = true; } } + if (modForPreHeader) + { + fgUpdateChangedFlowGraph(FlowGraphUpdates::COMPUTE_DOMS); + } + + // Starting now, we require all loops to have pre-headers. + optLoopsRequirePreHeaders = true; #ifdef DEBUG if (verbose && (optLoopCount > 0)) @@ -3703,11 +3737,11 @@ bool Compiler::optComputeLoopRep(int constInit, // Initialize loopCount to zero. loopCount = 0; - // If dupCond is true then the loop head contains a test which skips + // If dupCond is true then the loop initialization block contains a test which skips // this loop, if the constInit does not pass the loop test. // Such a loop can execute zero times. - // If dupCond is false then we have a true do-while loop which we - // always execute the loop once before performing the loop test + // If dupCond is false then we have a true do-while loop where we + // always execute the loop once before performing the loop test. if (!dupCond) { loopCount += 1; @@ -4527,14 +4561,19 @@ PhaseStatus Compiler::optUnrollLoops() fgRemoveAllRefPreds(succ, block); } - block->bbStmtList = nullptr; - block->bbJumpKind = BBJ_NONE; - block->bbFlags &= ~BBF_LOOP_HEAD; + block->bbStmtList = nullptr; + block->bbJumpKind = BBJ_NONE; block->bbJumpDest = nullptr; block->bbNatLoopNum = newLoopNum; + + // Remove a few unnecessary flags (this list is not comprehensive). + block->bbFlags &= ~(BBF_LOOP_HEAD | BBF_BACKWARD_JUMP_SOURCE | BBF_BACKWARD_JUMP_TARGET | + BBF_HAS_IDX_LEN | BBF_HAS_MD_IDX_LEN | BBF_HAS_MDARRAYREF | BBF_HAS_NEWOBJ); + + JITDUMP("Scrubbed old loop body block " FMT_BB "\n", block->bbNum); } - // The old loop blocks will form an emtpy linear chain. + // The old loop blocks will form an empty linear chain. // Add back a suitable pred list links. // BasicBlock* oldLoopPred = head; @@ -4554,24 +4593,37 @@ PhaseStatus Compiler::optUnrollLoops() // Now fix up the exterior flow and pred list entries. // - // Control will fall through from HEAD to its successor, which is either - // the now empty TOP (if totalIter == 0) or the first cloned top. + // Control will fall through from the initBlock to its successor, which is either + // the pre-header HEAD (if it exists), or the now empty TOP (if totalIter == 0), + // or the first cloned top. // - // If the HEAD is a BBJ_COND drop the condition (and make HEAD a BBJ_NONE block). + // If the initBlock is a BBJ_COND drop the condition (and make initBlock a BBJ_NONE block). // - if (head->bbJumpKind == BBJ_COND) + if (initBlock->bbJumpKind == BBJ_COND) { - testStmt = head->lastStmt(); - noway_assert(testStmt->GetRootNode()->gtOper == GT_JTRUE); - fgRemoveStmt(head, testStmt); - fgRemoveRefPred(head->bbJumpDest, head); - head->bbJumpKind = BBJ_NONE; + assert(dupCond); + Statement* initBlockBranchStmt = initBlock->lastStmt(); + noway_assert(initBlockBranchStmt->GetRootNode()->OperIs(GT_JTRUE)); + fgRemoveStmt(initBlock, initBlockBranchStmt); + fgRemoveRefPred(initBlock->bbJumpDest, initBlock); + initBlock->bbJumpKind = BBJ_NONE; } else { /* the loop must execute */ + assert(!dupCond); assert(totalIter > 0); - noway_assert(head->bbJumpKind == BBJ_NONE); + noway_assert(initBlock->bbJumpKind == BBJ_NONE); + } + + // The loop will be removed, so no need to fix up the pre-header. + if (loop.lpFlags & LPFLG_HAS_PREHEAD) + { + assert(head->bbFlags & BBF_LOOP_PREHEADER); + + // For unrolled loops, all the unrolling preconditions require the pre-header block to fall + // through into TOP. + assert(head->bbJumpKind == BBJ_NONE); } // If we actually unrolled, tail is now reached @@ -5464,8 +5516,9 @@ void Compiler::optResetLoopInfo() // TODO: the loop table is always allocated as the same (maximum) size, so this is wasteful. // We could zero it out (possibly only in DEBUG) to be paranoid, but there's no reason to // force it to be re-allocated. - optLoopTable = nullptr; - optLoopTableValid = false; + optLoopTable = nullptr; + optLoopTableValid = false; + optLoopsRequirePreHeaders = false; for (BasicBlock* const block : Blocks()) { @@ -6618,7 +6671,7 @@ void Compiler::optPerformHoistExpr(GenTree* origExpr, BasicBlock* exprBb, unsign } //------------------------------------------------------------------------ -// optHoistLoopNest: run loop hoisting for indicated loop and all contained loops +// optHoistLoopCode: run loop hoisting phase // // Returns: // suitable phase status @@ -6751,9 +6804,6 @@ bool Compiler::optHoistLoopNest(unsigned lnum, LoopHoistContext* hoistCtxt) m_loopsConsidered++; #endif // LOOP_HOIST_STATS - BasicBlockList* firstPreHeader = nullptr; - BasicBlockList** preHeaderAppendPtr = &firstPreHeader; - bool modified = false; if (optLoopTable[lnum].lpChild != BasicBlock::NOT_IN_LOOP) @@ -6762,30 +6812,10 @@ bool Compiler::optHoistLoopNest(unsigned lnum, LoopHoistContext* hoistCtxt) child = optLoopTable[child].lpSibling) { modified |= optHoistLoopNest(child, hoistCtxt); - - if (optLoopTable[child].lpFlags & LPFLG_HAS_PREHEAD) - { - // If a pre-header is found, add it to the tracking list. Most likely, the recursive call - // to hoist from the `child` loop nest found something to hoist and created a pre-header - // where the hoisted code was placed. - - BasicBlock* preHeaderBlock = optLoopTable[child].lpHead; - JITDUMP(" PREHEADER: " FMT_BB "\n", preHeaderBlock->bbNum); - - // Here, we are arranging the blocks in reverse lexical order, so when they are removed from the - // head of this list and pushed on the stack of hoisting blocks to process, they are in - // forward lexical order when popped. Note that for child loops `i` and `j`, in order `i` - // followed by `j` in the `lpSibling` list, that `j` is actually first in lexical order - // and that makes these blocks arranged in reverse lexical order in this list. - // That is, the sibling list is in reverse lexical order. - // Note: the sibling list order should not matter to any algorithm; this order is not guaranteed. - *preHeaderAppendPtr = new (this, CMK_LoopHoist) BasicBlockList(preHeaderBlock, nullptr); - preHeaderAppendPtr = &((*preHeaderAppendPtr)->next); - } } } - modified |= optHoistThisLoop(lnum, hoistCtxt, firstPreHeader); + modified |= optHoistThisLoop(lnum, hoistCtxt); return modified; } @@ -6796,12 +6826,11 @@ bool Compiler::optHoistLoopNest(unsigned lnum, LoopHoistContext* hoistCtxt) // Arguments: // lnum - loop to process // hoistCtxt - context for the hoisting -// existingPreHeaders - list of preheaders created for child loops // // Returns: // true if any hoisting was done // -bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, BasicBlockList* existingPreHeaders) +bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt) { LoopDsc* pLoopDsc = &optLoopTable[lnum]; @@ -6828,10 +6857,9 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi VARSET_TP loopVars(VarSetOps::Intersection(this, pLoopDsc->lpVarInOut, pLoopDsc->lpVarUseDef)); - pLoopDsc->lpVarInOutCount = VarSetOps::Count(this, pLoopDsc->lpVarInOut); - pLoopDsc->lpLoopVarCount = VarSetOps::Count(this, loopVars); - pLoopDsc->lpHoistedExprCount = 0; - pLoopDsc->lpHoistAddedPreheader = false; + pLoopDsc->lpVarInOutCount = VarSetOps::Count(this, pLoopDsc->lpVarInOut); + pLoopDsc->lpLoopVarCount = VarSetOps::Count(this, loopVars); + pLoopDsc->lpHoistedExprCount = 0; #ifndef TARGET_64BIT @@ -6907,29 +6935,98 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi // Until we have post-dominators, we'll special-case for single-exit blocks. // // Todo: it is not clear if this is a correctness requirement or a profitability heuristic. - // It seems like the latter. Ideally have enough safeguards to prevent hoisting exception - // or side-effect dependent things. + // It seems like the latter. Ideally there are enough safeguards to prevent hoisting exception + // or side-effect dependent things. Note that HoistVisitor uses `m_beforeSideEffect` to determine if it's + // ok to hoist a side-effect. It allows this only for the first block (the entry block), before any + // side-effect has been seen. After the first block, it assumes that there has been a side effect and + // no further side-effect can be hoisted. It is true that we don't analyze any program behavior in the + // flow graph between the entry block and the subsequent blocks, whether they be the next block dominating + // the exit block, or the pre-headers of nested loops. // // We really should consider hoisting from conditionally executed blocks, if they are frequently executed // and it is safe to evaluate the tree early. // ArrayStack defExec(getAllocatorLoopHoist()); - BasicBlockList* preHeadersList = existingPreHeaders; + + // Add the pre-headers of any child loops to the list of blocks to consider for hoisting. + // Note that these are not necessarily definitely executed. However, it is a heuristic that they will + // often provide good opportunities for further hoisting since we hoist from inside-out, + // and the inner loop may have already hoisted something loop-invariant to them. If the child + // loop pre-header block would be added anyway (by dominating the loop exit block), we don't + // add it here, and let it be added naturally, below. + // + // Note that all pre-headers get added first, which means they get considered for hoisting last. It is + // assumed that the order does not matter for correctness (since there is no execution order known). + // Note that the order does matter for the hoisting profitability heuristics, as we might + // run out of hoisting budget when processing the blocks. + // + // For example, consider this loop nest: + // + // for (....) { // loop L00 + // pre-header 1 + // for (...) { // loop L01 + // } + // // pre-header 2 + // for (...) { // loop L02 + // // pre-header 3 + // for (...) { // loop L03 + // } + // } + // } + // + // When processing the outer loop L00 (with an assumed single exit), we will push on the defExec stack + // pre-header 2, pre-header 1, the loop exit block, any IDom tree blocks leading to the entry block, + // and finally the entry block. (Note that the child loop iteration order of a loop is from "farthest" + // from the loop "head" to "nearest".) Blocks are considered for hoisting in the opposite order. + // + // Note that pre-header 3 is not pushed, since it is not a direct child. It would have been processed + // when loop L02 was considered for hoisting. + // + // The order of pushing pre-header 1 and pre-header 2 is based on the order in the loop table (which is + // convenient). But note that it is arbitrary because there is not guaranteed execution order amongst + // the child loops. + + int childLoopPreHeaders = 0; + for (BasicBlock::loopNumber childLoop = pLoopDsc->lpChild; // + childLoop != BasicBlock::NOT_IN_LOOP; // + childLoop = optLoopTable[childLoop].lpSibling) + { + if (optLoopTable[childLoop].lpIsRemoved()) + { + continue; + } + INDEBUG(optLoopTable[childLoop].lpValidatePreHeader()); + BasicBlock* childPreHead = optLoopTable[childLoop].lpHead; + if (pLoopDsc->lpExitCnt == 1) + { + if (fgDominate(childPreHead, pLoopDsc->lpExit)) + { + // If the child loop pre-header dominates the exit, it will get added in the dominator tree + // loop below. + continue; + } + } + else + { + // If the child loop pre-header is the loop entry for a multi-exit loop, it will get added below. + if (childPreHead == pLoopDsc->lpEntry) + { + continue; + } + } + JITDUMP(" -- " FMT_BB " (child loop pre-header)\n", childPreHead->bbNum); + defExec.Push(childPreHead); + ++childLoopPreHeaders; + } if (pLoopDsc->lpExitCnt == 1) { assert(pLoopDsc->lpExit != nullptr); JITDUMP(" Considering hoisting in blocks that either dominate exit block " FMT_BB - " or preheaders of nested loops, if any:\n", + ", or pre-headers of nested loops, if any:\n", pLoopDsc->lpExit->bbNum); // Push dominators, until we reach "entry" or exit the loop. - // Also push the preheaders that were added for the nested loops, - // if any, along the way such that in the final list, the dominating - // blocks are visited before the dominated blocks. - // - // TODO-CQ: In future, we should create preheaders upfront before building - // dominators so we don't have to do this extra work here. BasicBlock* cur = pLoopDsc->lpExit; while (cur != nullptr && pLoopDsc->lpContains(cur) && cur != pLoopDsc->lpEntry) @@ -6937,22 +7034,6 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi JITDUMP(" -- " FMT_BB " (dominate exit block)\n", cur->bbNum); defExec.Push(cur); cur = cur->bbIDom; - - for (; preHeadersList != nullptr; preHeadersList = preHeadersList->next) - { - BasicBlock* preHeaderBlock = preHeadersList->block; - BasicBlock* lpEntry = optLoopEntry(preHeaderBlock); - if (cur->bbNum < lpEntry->bbNum) - { - JITDUMP(" -- " FMT_BB " (preheader of " FMT_LP ")\n", preHeaderBlock->bbNum, - lpEntry->bbNatLoopNum); - defExec.Push(preHeaderBlock); - } - else - { - break; - } - } } // If we didn't reach the entry block, give up and *just* push the entry block (and the pre-headers). @@ -6961,8 +7042,8 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi JITDUMP(" -- odd, we didn't reach entry from exit via dominators. Only considering hoisting in entry " "block " FMT_BB "\n", pLoopDsc->lpEntry->bbNum); - defExec.Reset(); - preHeadersList = existingPreHeaders; + defExec.Pop(defExec.Height() - childLoopPreHeaders); + assert(defExec.Height() == childLoopPreHeaders); } } else // More than one exit @@ -6974,22 +7055,13 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi pLoopDsc->lpEntry->bbNum, lnum); } - // Push the remaining preheaders, if any. - for (; preHeadersList != nullptr; preHeadersList = preHeadersList->next) - { - BasicBlock* preHeaderBlock = preHeadersList->block; - JITDUMP(" -- " FMT_BB " (preheader of " FMT_LP ")\n", preHeaderBlock->bbNum, - optLoopEntry(preHeaderBlock)->bbNatLoopNum); - defExec.Push(preHeaderBlock); - } - JITDUMP(" -- " FMT_BB " (entry block)\n", pLoopDsc->lpEntry->bbNum); defExec.Push(pLoopDsc->lpEntry); optHoistLoopBlocks(lnum, &defExec, hoistCtxt); const unsigned numHoisted = pLoopDsc->lpHoistedFPExprCount + pLoopDsc->lpHoistedExprCount; - return pLoopDsc->lpHoistAddedPreheader || (numHoisted > 0); + return numHoisted > 0; } bool Compiler::optIsProfitableToHoistTree(GenTree* tree, unsigned lnum) @@ -7852,9 +7924,8 @@ void Compiler::optHoistCandidate(GenTree* tree, BasicBlock* treeBb, unsigned lnu return; } - // Create a loop pre-header in which to put the hoisted code. - const bool newPreheader = fgCreateLoopPreHeader(lnum); - optLoopTable[lnum].lpHoistAddedPreheader |= newPreheader; + // We should already have a pre-header for the loop. + INDEBUG(optLoopTable[lnum].lpValidatePreHeader()); // If the block we're hoisting from and the pre-header are in different EH regions, don't hoist. // TODO: we could probably hoist things that won't raise exceptions, such as constants. @@ -7992,6 +8063,8 @@ bool Compiler::optVNIsLoopInvariant(ValueNum vn, unsigned lnum, VNSet* loopVnInv // If there already exists a block that meets the pre-header requirements, that block is marked // as a pre-header, and no flow graph modification is made. // +// A loop with a pre-header has the flag LPFLG_HAS_PREHEAD, and its pre-header block has the flag BBF_LOOP_PREHEADER. +// // Note that the pre-header block can be in a different EH region from blocks in the loop, including the // entry block. Code doing hoisting is required to check the EH legality of hoisting to the pre-header // before doing so. @@ -8003,17 +8076,13 @@ bool Compiler::optVNIsLoopInvariant(ValueNum vn, unsigned lnum, VNSet* loopVnInv // handle queries about the pre-header dominating other blocks, even without re-computing dominators. // The preds lists have been maintained. // -// Currently, if you create a pre-header but don't put any code in it, any subsequent fgUpdateFlowGraph() -// pass might choose to compact the empty pre-header with a predecessor block. That is, a pre-header -// block might disappear if not used. -// // The code does not depend on the order of the BasicBlock bbNum. // // Arguments: // lnum - loop index // // Returns: -// true if new preheader was created +// true if new pre-header was created // bool Compiler::fgCreateLoopPreHeader(unsigned lnum) { @@ -8026,7 +8095,7 @@ bool Compiler::fgCreateLoopPreHeader(unsigned lnum) LoopDsc& loop = optLoopTable[lnum]; - // Have we already created a loop-preheader block? + // Have we already created a loop pre-header block? if (loop.lpFlags & LPFLG_HAS_PREHEAD) { @@ -8035,6 +8104,9 @@ bool Compiler::fgCreateLoopPreHeader(unsigned lnum) return false; } + // Assert that we haven't created SSA. It is assumed that we create all loop pre-headers before building SSA. + assert(fgSsaPassesCompleted == 0); + BasicBlock* head = loop.lpHead; BasicBlock* top = loop.lpTop; BasicBlock* entry = loop.lpEntry; @@ -8216,35 +8288,6 @@ bool Compiler::fgCreateLoopPreHeader(unsigned lnum) // Link in the preHead block fgInsertBBbefore(top, preHead); - // Ideally we would re-run SSA and VN if we optimized by doing loop hoisting. - // However, that is too expensive at this point. Instead, we update the phi - // node block references, if we created pre-header block due to hoisting. - // This is sufficient because any definition participating in SSA that flowed - // into the phi via the loop header block will now flow through the preheader - // block from the header block. - - for (Statement* const stmt : top->Statements()) - { - GenTree* tree = stmt->GetRootNode(); - if (tree->OperGet() != GT_ASG) - { - break; - } - GenTree* op2 = tree->gtGetOp2(); - if (op2->OperGet() != GT_PHI) - { - break; - } - for (GenTreePhi::Use& use : op2->AsPhi()->Uses()) - { - GenTreePhiArg* phiArg = use.GetNode()->AsPhiArg(); - if (phiArg->gtPredBB == head) - { - phiArg->gtPredBB = preHead; - } - } - } - // In which EH region should the pre-header live? // // The pre-header block is added immediately before `top`.