From ae9c728293f2b2df9b92ed3f74c1ef03830a3d57 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Fri, 6 Jan 2023 13:02:06 -0800 Subject: [PATCH 01/14] Always create loop pre-header As part of finding natural loops and creating the loop table, create a loop pre-header for every loop. This simplifies a lot of downstream phases, as the loop pre-header will be guaranteed to exist, and will already exist in the dominator tree. Introduce code to preserve an empty pre-header block through the optimization phases. Remove now unnecessary code in hoisting and elsewhere. Fixes #77033, #62665 --- src/coreclr/jit/optimizer.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index c1ded572990688..a3c8ab226fb6d3 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -2687,19 +2687,20 @@ void Compiler::optFindNaturalLoops() fgUpdateChangedFlowGraph(FlowGraphUpdates::COMPUTE_DOMS); } - if (false /* pre-header stress */) + // Create loop pre-header for every loop. + bool modForPreHeader = false; + for (unsigned loopInd = 0; loopInd < optLoopCount; loopInd++) { - // Stress mode: aggressively create loop pre-header for every loop. - for (unsigned loopInd = 0; loopInd < optLoopCount; loopInd++) + if (fgCreateLoopPreHeader(loopInd)) { - fgCreateLoopPreHeader(loopInd); - } - - if (fgModified) - { - fgUpdateChangedFlowGraph(FlowGraphUpdates::COMPUTE_DOMS); + modForPreHeader = true; } } + if (modForPreHeader) + { + // The predecessors were maintained in fgCreateLoopPreHeader; don't rebuild them. + fgUpdateChangedFlowGraph(FlowGraphUpdates::COMPUTE_DOMS); + } #ifdef DEBUG if (verbose && (optLoopCount > 0)) @@ -7992,6 +7993,8 @@ bool Compiler::optVNIsLoopInvariant(ValueNum vn, unsigned lnum, VNSet* loopVnInv // If there already exists a block that meets the pre-header requirements, that block is marked // as a pre-header, and no flow graph modification is made. // +// A loop with a pre-header has the flag LPFLG_HAS_PREHEAD, and its pre-header block has the flag BBF_LOOP_PREHEADER. +// // Note that the pre-header block can be in a different EH region from blocks in the loop, including the // entry block. Code doing hoisting is required to check the EH legality of hoisting to the pre-header // before doing so. @@ -8003,10 +8006,6 @@ bool Compiler::optVNIsLoopInvariant(ValueNum vn, unsigned lnum, VNSet* loopVnInv // handle queries about the pre-header dominating other blocks, even without re-computing dominators. // The preds lists have been maintained. // -// Currently, if you create a pre-header but don't put any code in it, any subsequent fgUpdateFlowGraph() -// pass might choose to compact the empty pre-header with a predecessor block. That is, a pre-header -// block might disappear if not used. -// // The code does not depend on the order of the BasicBlock bbNum. // // Arguments: @@ -8222,6 +8221,7 @@ bool Compiler::fgCreateLoopPreHeader(unsigned lnum) // This is sufficient because any definition participating in SSA that flowed // into the phi via the loop header block will now flow through the preheader // block from the header block. + // TODO: if we always create and maintain pre-headers before SSA, can we delete this? for (Statement* const stmt : top->Statements()) { From f291aa53b1bf0c979030f3df47a530976c320be9 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Fri, 10 Mar 2023 16:04:23 -0800 Subject: [PATCH 02/14] Fix loop unrolling to work with loop pre-headers --- src/coreclr/jit/optimizer.cpp | 54 ++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index a3c8ab226fb6d3..5a8b31d691460c 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -3704,11 +3704,11 @@ bool Compiler::optComputeLoopRep(int constInit, // Initialize loopCount to zero. loopCount = 0; - // If dupCond is true then the loop head contains a test which skips + // If dupCond is true then the loop initialization block contains a test which skips // this loop, if the constInit does not pass the loop test. // Such a loop can execute zero times. - // If dupCond is false then we have a true do-while loop which we - // always execute the loop once before performing the loop test + // If dupCond is false then we have a true do-while loop where we + // always execute the loop once before performing the loop test. if (!dupCond) { loopCount += 1; @@ -4528,14 +4528,19 @@ PhaseStatus Compiler::optUnrollLoops() fgRemoveAllRefPreds(succ, block); } - block->bbStmtList = nullptr; - block->bbJumpKind = BBJ_NONE; - block->bbFlags &= ~BBF_LOOP_HEAD; + block->bbStmtList = nullptr; + block->bbJumpKind = BBJ_NONE; block->bbJumpDest = nullptr; block->bbNatLoopNum = newLoopNum; + + // Remove a few unnecessary flags (this list is not comprehensive). + block->bbFlags &= ~(BBF_LOOP_HEAD | BBF_BACKWARD_JUMP_SOURCE | BBF_BACKWARD_JUMP_TARGET | + BBF_HAS_IDX_LEN | BBF_HAS_MD_IDX_LEN | BBF_HAS_MDARRAYREF | BBF_HAS_NEWOBJ); + + JITDUMP("Scrubbed old loop body block " FMT_BB "\n", block->bbNum); } - // The old loop blocks will form an emtpy linear chain. + // The old loop blocks will form an empty linear chain. // Add back a suitable pred list links. // BasicBlock* oldLoopPred = head; @@ -4555,24 +4560,39 @@ PhaseStatus Compiler::optUnrollLoops() // Now fix up the exterior flow and pred list entries. // - // Control will fall through from HEAD to its successor, which is either - // the now empty TOP (if totalIter == 0) or the first cloned top. + // Control will fall through from the initBlock to its successor, which is either + // the pre-header HEAD (if it exists), or the now empty TOP (if totalIter == 0), + // or the first cloned top. // - // If the HEAD is a BBJ_COND drop the condition (and make HEAD a BBJ_NONE block). + // If the initBlock is a BBJ_COND drop the condition (and make initBlock a BBJ_NONE block). // - if (head->bbJumpKind == BBJ_COND) + // Also, make sure HEAD is a BBJ_NONE block. + // + if (initBlock->bbJumpKind == BBJ_COND) { - testStmt = head->lastStmt(); - noway_assert(testStmt->GetRootNode()->gtOper == GT_JTRUE); - fgRemoveStmt(head, testStmt); - fgRemoveRefPred(head->bbJumpDest, head); - head->bbJumpKind = BBJ_NONE; + assert(dupCond); + Statement* initBlockBranchStmt = initBlock->lastStmt(); + noway_assert(initBlockBranchStmt->GetRootNode()->OperIs(GT_JTRUE)); + fgRemoveStmt(initBlock, initBlockBranchStmt); + fgRemoveRefPred(initBlock->bbJumpDest, initBlock); + initBlock->bbJumpKind = BBJ_NONE; } else { /* the loop must execute */ + assert(!dupCond); assert(totalIter > 0); - noway_assert(head->bbJumpKind == BBJ_NONE); + noway_assert(initBlock->bbJumpKind == BBJ_NONE); + } + + // The loop will be removed, so no need to fix up the pre-header. + if (loop.lpFlags & LPFLG_HAS_PREHEAD) + { + assert(head->bbFlags & BBF_LOOP_PREHEADER); + + // For unrolled loops, all the unrolling preconditions require the pre-header block to fall + // through into TOP. + assert(head->bbJumpKind == BBJ_NONE); } // If we actually unrolled, tail is now reached From 9a16b00360e658d0a62d59b5124e13cdf6e7b96d Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Fri, 10 Mar 2023 17:27:57 -0800 Subject: [PATCH 03/14] Add `optLoopsRequirePreHeaders` variable --- src/coreclr/jit/compiler.cpp | 5 +++-- src/coreclr/jit/compiler.h | 9 +++++---- src/coreclr/jit/fgdiagnostic.cpp | 6 ++++++ src/coreclr/jit/fgopt.cpp | 12 ++++++------ src/coreclr/jit/optimizer.cpp | 17 +++++++++++------ 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index fe398e51fbdc34..b9199d49b646cc 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -5003,8 +5003,9 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl // Dominator and reachability sets are no longer valid. // The loop table is no longer valid. - fgDomsComputed = false; - optLoopTableValid = false; + fgDomsComputed = false; + optLoopTableValid = false; + optLoopsRequirePreHeaders = false; #ifdef DEBUG DoPhase(this, PHASE_STRESS_SPLIT_TREE, &Compiler::StressSplitTree); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index e21307de79e919..1e0624802a3dd1 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6512,10 +6512,11 @@ class Compiler bool fgHasLoops; // True if this method has any loops, set in fgComputeReachability public: - LoopDsc* optLoopTable; // loop descriptor table - bool optLoopTableValid; // info in loop table should be valid - unsigned char optLoopCount; // number of tracked loops - unsigned char loopAlignCandidates; // number of loops identified for alignment + LoopDsc* optLoopTable; // loop descriptor table + bool optLoopTableValid; // info in loop table should be valid + bool optLoopsRequirePreHeaders; // Do we require that all loops have pre-headers? + unsigned char optLoopCount; // number of tracked loops + unsigned char loopAlignCandidates; // number of loops identified for alignment // Every time we rebuild the loop table, we increase the global "loop epoch". Any loop indices or // loop table pointers from the previous epoch are invalid. diff --git a/src/coreclr/jit/fgdiagnostic.cpp b/src/coreclr/jit/fgdiagnostic.cpp index 176f10bc8cac1f..8a36ae86c9e941 100644 --- a/src/coreclr/jit/fgdiagnostic.cpp +++ b/src/coreclr/jit/fgdiagnostic.cpp @@ -4335,6 +4335,7 @@ void Compiler::fgDebugCheckSsa() // - All basic blocks with loop numbers set have a corresponding loop in the table // - All basic blocks without a loop number are not in a loop // - All parents of the loop with the block contain that block +// - If optLoopsRequirePreHeaders is true, the loop has a pre-header // - If the loop has a pre-header, it is valid // - The loop flags are valid // - no loop shares `top` with any of its children @@ -4593,6 +4594,11 @@ void Compiler::fgDebugCheckLoopTable() } } + if (optLoopsRequirePreHeaders) + { + assert((loop.lpFlags & LPFLG_HAS_PREHEAD) != 0); + } + // If the loop has a pre-header, ensure the pre-header form is correct. if ((loop.lpFlags & LPFLG_HAS_PREHEAD) != 0) { diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index f6cc772cd204a4..5851ce79dbee73 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -1983,16 +1983,14 @@ bool Compiler::fgCanCompactBlocks(BasicBlock* block, BasicBlock* bNext) } } - // We cannot compact a block that participates in loop - // alignment. + // We cannot compact a block that participates in loop alignment. // if ((bNext->countOfInEdges() > 1) && bNext->isLoopAlign()) { return false; } - // If we are trying to compact blocks from different loops - // that don't do it. + // Don't compact blocks from different loops. // if ((block->bbNatLoopNum != BasicBlock::NOT_IN_LOOP) && (bNext->bbNatLoopNum != BasicBlock::NOT_IN_LOOP) && (block->bbNatLoopNum != bNext->bbNatLoopNum)) @@ -2060,6 +2058,7 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext) // `block` can no longer be a loop pre-header (if it was before). // + assert(!optLoopsRequirePreHeaders || ((block->bbFlags & BBF_LOOP_PREHEADER) == 0)); block->bbFlags &= ~BBF_LOOP_PREHEADER; // Retarget all the other edges incident on bNext. Do this @@ -6027,8 +6026,9 @@ PhaseStatus Compiler::fgUpdateFlowGraphPhase() // Dominator and reachability sets are no longer valid. // The loop table is no longer valid. - fgDomsComputed = false; - optLoopTableValid = false; + fgDomsComputed = false; + optLoopTableValid = false; + optLoopsRequirePreHeaders = false; return madeChanges ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING; } diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 5a8b31d691460c..5c790ef820dcc1 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -23,10 +23,11 @@ void Compiler::optInit() loopAlignCandidates = 0; /* Initialize the # of tracked loops to 0 */ - optLoopCount = 0; - optLoopTable = nullptr; - optLoopTableValid = false; - optCurLoopEpoch = 0; + optLoopCount = 0; + optLoopTable = nullptr; + optLoopTableValid = false; + optLoopsRequirePreHeaders = false; + optCurLoopEpoch = 0; #ifdef DEBUG loopsAligned = 0; @@ -2702,6 +2703,9 @@ void Compiler::optFindNaturalLoops() fgUpdateChangedFlowGraph(FlowGraphUpdates::COMPUTE_DOMS); } + // Starting now, we require all loops to have pre-headers. + optLoopsRequirePreHeaders = true; + #ifdef DEBUG if (verbose && (optLoopCount > 0)) { @@ -5485,8 +5489,9 @@ void Compiler::optResetLoopInfo() // TODO: the loop table is always allocated as the same (maximum) size, so this is wasteful. // We could zero it out (possibly only in DEBUG) to be paranoid, but there's no reason to // force it to be re-allocated. - optLoopTable = nullptr; - optLoopTableValid = false; + optLoopTable = nullptr; + optLoopTableValid = false; + optLoopsRequirePreHeaders = false; for (BasicBlock* const block : Blocks()) { From 47d1f845d792de696b9286e3c52ae33980d00c27 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Tue, 14 Mar 2023 16:28:27 -0700 Subject: [PATCH 04/14] Prevent removing pre-header blocks --- src/coreclr/jit/block.h | 2 +- src/coreclr/jit/compiler.cpp | 3 +++ src/coreclr/jit/fgbasic.cpp | 2 ++ src/coreclr/jit/fgopt.cpp | 20 ++++++++++++++++++++ src/coreclr/jit/optimizer.cpp | 2 +- 5 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index da99e8d51efdfc..27a1d7249a9f40 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -541,7 +541,7 @@ enum BasicBlockFlags : unsigned __int64 // Flags to update when two blocks are compacted BBF_COMPACT_UPD = BBF_GC_SAFE_POINT | BBF_HAS_JMP | BBF_HAS_IDX_LEN | BBF_HAS_MD_IDX_LEN | BBF_BACKWARD_JUMP | \ - BBF_HAS_NEWOBJ | BBF_HAS_NULLCHECK | BBF_HAS_MDARRAYREF, + BBF_HAS_NEWOBJ | BBF_HAS_NULLCHECK | BBF_HAS_MDARRAYREF | BBF_LOOP_PREHEADER, // Flags a block should not have had before it is split. diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index b9199d49b646cc..f1ee62d702ba18 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -5104,6 +5104,9 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl // Now that lowering is completed we can proceed to perform register allocation // + // // LSRA block ordering depends on bbNum. Renumber the blocks, if necessary, before LSRA. + // fgDomsComputed = false; + // fgRenumberBlocks(); auto linearScanPhase = [this]() { m_pLinearScan->doLinearScan(); }; DoPhase(this, PHASE_LINEAR_SCAN, linearScanPhase); diff --git a/src/coreclr/jit/fgbasic.cpp b/src/coreclr/jit/fgbasic.cpp index fe1f74c8b45138..4e52358207811b 100644 --- a/src/coreclr/jit/fgbasic.cpp +++ b/src/coreclr/jit/fgbasic.cpp @@ -4971,6 +4971,8 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable) JITDUMP("fgRemoveBlock " FMT_BB ", unreachable=%s\n", block->bbNum, dspBool(unreachable)); + assert(!optLoopsRequirePreHeaders || ((block->bbFlags & BBF_LOOP_PREHEADER) == 0)); + // If we've cached any mappings from switch blocks to SwitchDesc's (which contain only the // *unique* successors of the switch block), invalidate that cache, since an entry in one of // the SwitchDescs might be removed. diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 5851ce79dbee73..60300c1ef02268 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -1947,6 +1947,17 @@ bool Compiler::fgCanCompactBlocks(BasicBlock* block, BasicBlock* bNext) return false; } + // Don't allow removing an empty loop pre-header. + // We can compact a pre-header into an empty `block`, as long as we propagate the BBF_LOOP_PREHEADER + // bit properly. + if (optLoopsRequirePreHeaders) + { + if (((block->bbFlags & BBF_LOOP_PREHEADER) != 0) && (bNext->countOfInEdges() != 1)) + { + return false; + } + } + // Don't compact the first block if it was specially created as a scratch block. if (fgBBisScratch(block)) { @@ -3052,6 +3063,15 @@ bool Compiler::fgOptimizeEmptyBlock(BasicBlock* block) break; } + // Don't delete empty loop pre-headers. + if (optLoopsRequirePreHeaders) + { + if ((block->bbFlags & BBF_LOOP_PREHEADER) != 0) + { + break; + } + } + /* special case if this is the last BB */ if (block == fgLastBB) { diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 5c790ef820dcc1..d27c6808c61882 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -2393,7 +2393,7 @@ class LoopSearch block->bbFlags |= BBF_KEEP_BBJ_ALWAYS; } - // If block is newNext's only predcessor, move the IR from block to newNext, + // If block is newNext's only predecessor, move the IR from block to newNext, // but keep the now-empty block around. // // We move the IR because loop recognition has a very limited search capability and From 5f7700434ef0802025cd864be4b4862a928bd5ec Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Sun, 26 Mar 2023 12:47:01 -0700 Subject: [PATCH 05/14] Allow removing unreachable pre-headers Disallow creating pre-header after SSA is built --- src/coreclr/jit/fgbasic.cpp | 2 +- src/coreclr/jit/optimizer.cpp | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/fgbasic.cpp b/src/coreclr/jit/fgbasic.cpp index 4e52358207811b..868c02d0faf5ce 100644 --- a/src/coreclr/jit/fgbasic.cpp +++ b/src/coreclr/jit/fgbasic.cpp @@ -4971,7 +4971,7 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable) JITDUMP("fgRemoveBlock " FMT_BB ", unreachable=%s\n", block->bbNum, dspBool(unreachable)); - assert(!optLoopsRequirePreHeaders || ((block->bbFlags & BBF_LOOP_PREHEADER) == 0)); + assert(unreachable || !optLoopsRequirePreHeaders || ((block->bbFlags & BBF_LOOP_PREHEADER) == 0)); // If we've cached any mappings from switch blocks to SwitchDesc's (which contain only the // *unique* successors of the switch block), invalidate that cache, since an entry in one of diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index d27c6808c61882..ff3e9c043a75f2 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -8248,6 +8248,11 @@ bool Compiler::fgCreateLoopPreHeader(unsigned lnum) // block from the header block. // TODO: if we always create and maintain pre-headers before SSA, can we delete this? + // Assert that we haven't created SSA. It is assumed that we create all loop pre-headers before + // building SSA. + assert(fgSsaPassesCompleted == 0); + +#if 0 for (Statement* const stmt : top->Statements()) { GenTree* tree = stmt->GetRootNode(); @@ -8269,6 +8274,7 @@ bool Compiler::fgCreateLoopPreHeader(unsigned lnum) } } } +#endif // 0 // In which EH region should the pre-header live? // From 4fee3bc150061c7025a2df7414fe5922fdc88c89 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Sun, 26 Mar 2023 12:54:34 -0700 Subject: [PATCH 06/14] Make optLoopCloningEnabled() static --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/loopcloning.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 1e0624802a3dd1..9cb5bccd947351 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7651,7 +7651,7 @@ class Compiler bool optCheckLoopCloningGDVTestProfitable(GenTreeOp* guard, LoopCloneVisitorInfo* info); bool optIsHandleOrIndirOfHandle(GenTree* tree, GenTreeFlags handleType); - bool optLoopCloningEnabled(); + static bool optLoopCloningEnabled(); #ifdef DEBUG void optDebugLogLoopCloning(BasicBlock* block, Statement* insertBefore); diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp index d6acb100e62580..ad4ff7e1b3ed89 100644 --- a/src/coreclr/jit/loopcloning.cpp +++ b/src/coreclr/jit/loopcloning.cpp @@ -3172,6 +3172,7 @@ bool Compiler::optObtainLoopCloningOpts(LoopCloneContext* context) // Return Value: // true if loop cloning is allowed, false if disallowed. // +// static bool Compiler::optLoopCloningEnabled() { #ifdef DEBUG From e82478d584e4f05a8a64bfafd38731e95bb19829 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Sun, 26 Mar 2023 22:27:31 -0700 Subject: [PATCH 07/14] Teach loop cloning to expect and respect loop pre-headers --- src/coreclr/jit/compiler.h | 3 +- src/coreclr/jit/fgflow.cpp | 21 +-- src/coreclr/jit/loopcloning.cpp | 229 +++++++++----------------------- 3 files changed, 69 insertions(+), 184 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 9cb5bccd947351..d1fab2fc7222cb 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6262,7 +6262,6 @@ class Compiler PhaseStatus optCloneLoops(); void optCloneLoop(unsigned loopInd, LoopCloneContext* context); - void optEnsureUniqueHead(unsigned loopInd, weight_t ambientWeight); PhaseStatus optUnrollLoops(); // Unrolls loops (needs to have cost info) void optRemoveRedundantZeroInits(); PhaseStatus optIfConversion(); // If conversion @@ -6643,7 +6642,7 @@ class Compiler bool optIsLoopEntry(BasicBlock* block) const; // The depth of the loop described by "lnum" (an index into the loop table.) (0 == top level) - unsigned optLoopDepth(unsigned lnum) + unsigned optLoopDepth(unsigned lnum) const { assert(lnum < optLoopCount); unsigned depth = 0; diff --git a/src/coreclr/jit/fgflow.cpp b/src/coreclr/jit/fgflow.cpp index cf91ff3d33b280..5f265ea783a8cf 100644 --- a/src/coreclr/jit/fgflow.cpp +++ b/src/coreclr/jit/fgflow.cpp @@ -361,27 +361,20 @@ void Compiler::fgRemoveBlockAsPred(BasicBlock* block) fgRemoveRefPred(bNext, bNext->bbPreds->getSourceBlock()); } } + fgRemoveRefPred(block->bbJumpDest, block); + break; - FALLTHROUGH; - - case BBJ_COND: case BBJ_ALWAYS: case BBJ_EHCATCHRET: - - /* Update the predecessor list for 'block->bbJumpDest' and 'block->bbNext' */ fgRemoveRefPred(block->bbJumpDest, block); - - if (block->bbJumpKind != BBJ_COND) - { - break; - } - - /* If BBJ_COND fall through */ - FALLTHROUGH; + break; case BBJ_NONE: + fgRemoveRefPred(block->bbNext, block); + break; - /* Update the predecessor list for 'block->bbNext' */ + case BBJ_COND: + fgRemoveRefPred(block->bbJumpDest, block); fgRemoveRefPred(block->bbNext, block); break; diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp index ad4ff7e1b3ed89..681df8e88394fa 100644 --- a/src/coreclr/jit/loopcloning.cpp +++ b/src/coreclr/jit/loopcloning.cpp @@ -1970,81 +1970,6 @@ BasicBlock* Compiler::optInsertLoopChoiceConditions(LoopCloneContext* context, return insertAfter; } -//------------------------------------------------------------------------ -// OptEnsureUniqueHead: Ensure that loop "loopInd" has a unique head block. -// If the existing entry has non-loop predecessors other than the head entry, -// create a new, empty block that goes (only) to the entry, and redirects the -// preds of the entry to this new block. Sets the weight of the newly created -// block to "ambientWeight". -// -// NOTE: this is currently dead code, because it is only called by loop cloning, -// and loop cloning only works with single-entry loops where the immediately -// preceding head block is the only predecessor of the loop entry. -// -// Arguments: -// loopInd - index of loop to process -// ambientWeight - weight to give the new head, if created. -// -void Compiler::optEnsureUniqueHead(unsigned loopInd, weight_t ambientWeight) -{ - LoopDsc& loop = optLoopTable[loopInd]; - - BasicBlock* h = loop.lpHead; - BasicBlock* t = loop.lpTop; - BasicBlock* e = loop.lpEntry; - BasicBlock* b = loop.lpBottom; - - // If "h" dominates the entry block, then it is the unique header. - if (fgDominate(h, e)) - { - return; - } - - // Otherwise, create a new empty header block, make it the pred of the entry block, - // and redirect the preds of the entry block to go to this. - - BasicBlock* beforeTop = t->bbPrev; - assert(!beforeTop->bbFallsThrough() || (beforeTop->bbNext == e)); - - // Make sure that the new block is in the same region as the loop. - // (We will only create loops that are entirely within a region.) - BasicBlock* h2 = fgNewBBafter(BBJ_NONE, beforeTop, /*extendRegion*/ true); - assert(beforeTop->bbNext == h2); - - // This is in the containing loop. - h2->bbNatLoopNum = loop.lpParent; - h2->bbWeight = h2->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight; - - if (h2->bbNext != e) - { - h2->bbJumpKind = BBJ_ALWAYS; - h2->bbJumpDest = e; - } - BlockSetOps::Assign(this, h2->bbReach, e->bbReach); - - fgAddRefPred(e, h2); - - // Redirect paths from preds of "e" to go to "h2" instead of "e". - BlockToBlockMap* blockMap = new (getAllocator(CMK_LoopClone)) BlockToBlockMap(getAllocator(CMK_LoopClone)); - blockMap->Set(e, h2); - - for (BasicBlock* const predBlock : e->PredBlocks()) - { - // Skip if predBlock is in the loop. - if (t->bbNum <= predBlock->bbNum && predBlock->bbNum <= b->bbNum) - { - continue; - } - - optRedirectBlock(predBlock, blockMap); - - fgAddRefPred(h2, predBlock); - fgRemoveRefPred(e, predBlock); - } - - optUpdateLoopHead(loopInd, h, h2); -} - //------------------------------------------------------------------------ // optCloneLoop: Perform the mechanical cloning of the specified loop // @@ -2087,12 +2012,9 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) // the loop being cloned. unsigned char ambientLoop = loop.lpParent; - // First, make sure that the loop has a unique header block, creating an empty one if necessary. - optEnsureUniqueHead(loopInd, ambientWeight); - // We're going to transform this loop: // - // H --> E (or, H conditionally branches around the loop and has fall-through to T == E) + // H --> E (if T == E, H falls through to T/E) // T // E // B ?-> T @@ -2101,11 +2023,11 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) // to this pair of loops: // // H ?-> H3 (all loop failure conditions branch to new slow path loop head) - // H2--> E (Optional; if T == E, let H fall through to T/E) + // H2 --> E (if T == E, H2 will fall through to T/E) // T // E // B ?-> T - // X2--> X + // X2 --> X (only need this if B -> is conditional, not BBJ_ALWAYS) // H3 --> E2 (aka slowHead. Or, H3 falls through to T2 == E2) // T2 // E2 @@ -2113,28 +2035,38 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) // X BasicBlock* h = loop.lpHead; - if (!h->KindIs(BBJ_NONE, BBJ_ALWAYS)) - { - // Make a new block to be the unique entry to the loop. - JITDUMP("Create new unique single-successor entry to loop\n"); - assert((h->bbJumpKind == BBJ_COND) && (h->bbNext == loop.lpEntry)); - BasicBlock* newH = fgNewBBafter(BBJ_NONE, h, /*extendRegion*/ true); - JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", newH->bbNum, h->bbNum); - newH->bbWeight = newH->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight; - BlockSetOps::Assign(this, newH->bbReach, h->bbReach); - // This is in the scope of a surrounding loop, if one exists -- the parent of the loop we're cloning. - newH->bbNatLoopNum = ambientLoop; - optUpdateLoopHead(loopInd, h, newH); + assert((h->bbFlags & BBF_LOOP_PREHEADER) != 0); - fgAddRefPred(newH, h); // Add h->newH pred edge - JITDUMP("Adding " FMT_BB " -> " FMT_BB "\n", h->bbNum, newH->bbNum); - fgReplacePred(newH->bbNext, h, newH); // Replace pred in COND fall-through block. - JITDUMP("Replace " FMT_BB " -> " FMT_BB " with " FMT_BB " -> " FMT_BB "\n", h->bbNum, newH->bbNext->bbNum, - newH->bbNum, newH->bbNext->bbNum); + // Make a new pre-header block 'h2' for the loop. 'h' will fall through to 'h2'. + JITDUMP("Create new header block for loop\n"); + + BasicBlock* h2 = fgNewBBafter(BBJ_NONE, h, /*extendRegion*/ true); + JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", h2->bbNum, h->bbNum); + h2->bbWeight = h2->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight; + h2->bbNatLoopNum = ambientLoop; + h2->bbFlags |= BBF_LOOP_PREHEADER; - h = newH; + if (h->bbJumpKind != BBJ_NONE) + { + assert(h->bbJumpKind == BBJ_ALWAYS); + assert(h->bbJumpDest == loop.lpEntry); + h2->bbJumpKind = BBJ_ALWAYS; + h2->bbJumpDest = loop.lpEntry; } - assert(h == loop.lpHead); + + fgReplacePred(loop.lpEntry, h, h2); + JITDUMP("Replace " FMT_BB " -> " FMT_BB " with " FMT_BB " -> " FMT_BB "\n", h->bbNum, loop.lpEntry->bbNum, + h2->bbNum, loop.lpEntry->bbNum); + + // 'h' is no longer the loop head; 'h2' is! + h->bbFlags &= ~BBF_LOOP_PREHEADER; + optUpdateLoopHead(loopInd, h, h2); + + // Make 'h' fall through to 'h2' (if it didn't already). + // Don't add the h->h2 edge because we're going to insert the cloning conditions between 'h' and 'h2', and + // optInsertLoopChoiceConditions() will add the edge. + h->bbJumpKind = BBJ_NONE; + h->bbJumpDest = nullptr; // Make X2 after B, if necessary. (Not necessary if B is a BBJ_ALWAYS.) // "newPred" will be the predecessor of the blocks of the cloned loop. @@ -2168,41 +2100,21 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) } } - // We're going to create a new loop head for the slow loop immediately before the slow loop itself. All failed - // conditions will branch to the slow head. The slow head will either fall through to the entry, or unconditionally - // branch to the slow path entry. This puts the slow loop in the canonical loop form. - BasicBlock* slowHeadPrev = newPred; - - // Now we'll make "h2", after "h" to go to "e" -- unless the loop is a do-while, - // so that "h" already falls through to "e" (e == t). - // It might look like this code is unreachable, since "h" must be a BBJ_ALWAYS, but - // later we will change "h" to a BBJ_COND along with a set of loop conditions. - // Cloning is currently restricted to "do-while" loop forms, where this case won't occur. - // However, it can occur in OSR methods. - BasicBlock* h2 = nullptr; - if (h->bbNext != loop.lpEntry) - { - assert(h->bbJumpKind == BBJ_ALWAYS); - JITDUMP("Create branch to entry of optimized loop\n"); - h2 = fgNewBBafter(BBJ_ALWAYS, h, /*extendRegion*/ true); - JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", h2->bbNum, h->bbNum); - h2->bbWeight = h2->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight; - - // This is in the scope of a surrounding loop, if one exists -- the parent of the loop we're cloning. - h2->bbNatLoopNum = ambientLoop; - - h2->bbJumpDest = loop.lpEntry; - fgReplacePred(loop.lpEntry, h, h2); - JITDUMP("Replace " FMT_BB " -> " FMT_BB " with " FMT_BB " -> " FMT_BB "\n", h->bbNum, loop.lpEntry->bbNum, - h2->bbNum, loop.lpEntry->bbNum); - - optUpdateLoopHead(loopInd, h, h2); - - // NOTE: 'h' is no longer the loop head; 'h2' is! - } + // Create a new loop head for the slow loop immediately before the slow loop itself. All failed + // conditions will branch to the slow head. The slow head will either fall through or unconditionally + // branch to the slow path entry. The slowHead will be a loop pre-header, however we don't put the slow loop + // in the loop table, so it isn't marked as a pre-header (if we re-built the loop table after cloning, slowHead + // would end up in the loop table as a pre-header without creating any new pre-header block). This puts the + // slow loop in the canonical loop form. + JITDUMP("Create unique head block for slow path loop\n"); + BasicBlock* slowHead = fgNewBBafter(BBJ_NONE, newPred, /*extendRegion*/ true); + JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", slowHead->bbNum, newPred->bbNum); + slowHead->bbWeight = newPred->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight; + slowHead->scaleBBWeight(slowPathWeightScaleFactor); + slowHead->bbNatLoopNum = ambientLoop; + newPred = slowHead; // Now we'll clone the blocks of the loop body. These cloned blocks will be the slow path. - BasicBlock* newFirst = nullptr; BlockToBlockMap* blockMap = new (getAllocator(CMK_LoopClone)) BlockToBlockMap(getAllocator(CMK_LoopClone)); for (BasicBlock* const blk : loop.LoopBlocks()) @@ -2229,22 +2141,28 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) // If the original loop is aligned, do not align the cloned loop because cloned loop will be executed in // rare scenario. Additionally, having to align cloned loop will force us to disable some VEX prefix encoding // and adding compensation for over-estimated instructions. - if (blk->isLoopAlign()) + if (newBlk->isLoopAlign()) { newBlk->bbFlags &= ~BBF_LOOP_ALIGN; JITDUMP("Removing LOOP_ALIGN flag from cloned loop in " FMT_BB "\n", newBlk->bbNum); } #endif + // If the loop we're cloning contains a nested loop, we need to clear the pre-header bit on + // the nested loop pre-header block, since it will no longer be a loop pre-header. (This is because + // we don't add the slow loop or its child loops to the loop table. It would be simplest to + // just re-build the loop table if we want to enable loop optimization of the slow path loops.) + if ((newBlk->bbFlags & BBF_LOOP_PREHEADER) != 0) + { + JITDUMP("Removing BBF_LOOP_PREHEADER flag from nested cloned loop block " FMT_BB "\n", newBlk->bbNum); + newBlk->bbFlags &= ~BBF_LOOP_PREHEADER; + } + // TODO-Cleanup: The above clones the bbNatLoopNum, which is incorrect. Eventually, we should probably insert // the cloned loop in the loop table. For now, however, we'll just make these blocks be part of the surrounding // loop, if one exists -- the parent of the loop we're cloning. newBlk->bbNatLoopNum = loop.lpParent; - if (newFirst == nullptr) - { - newFirst = newBlk; - } newPred = newBlk; blockMap->Set(blk, newBlk); } @@ -2321,44 +2239,25 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) // !cond1 -?> slowHead // ... // !condn -?> slowHead - // h2/entry (fast) + // h2 -?> e (fast loop pre-header branch or fall-through to e) // ... - // slowHead -?> e2 (slowHead) branch or fall-through to e2 + // slowHead -?> e2 (slow loop pre-header branch or fall-through to e2) // // We should always have block conditions. assert(context->HasBlockConditions(loopInd)); - - if (h->bbJumpKind == BBJ_NONE) - { - assert(h->bbNext == loop.lpEntry); - fgRemoveRefPred(h->bbNext, h); - } - else - { - assert(h->bbJumpKind == BBJ_ALWAYS); - assert(h->bbJumpDest == loop.lpEntry); - assert(h2 != nullptr); - h->bbJumpKind = BBJ_NONE; - h->bbJumpDest = nullptr; - } + assert(h->bbJumpKind == BBJ_NONE); + assert(h->bbNext == h2); // If any condition is false, go to slowHead (which branches or falls through to e2). BasicBlock* e2 = nullptr; bool foundIt = blockMap->Lookup(loop.lpEntry, &e2); assert(foundIt && e2 != nullptr); - // Create a unique header for the slow path. - JITDUMP("Create unique head block for slow path loop\n"); - BasicBlock* slowHead = fgNewBBafter(BBJ_NONE, slowHeadPrev, /*extendRegion*/ true); - JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", slowHead->bbNum, slowHeadPrev->bbNum); - slowHead->bbWeight = slowHeadPrev->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight; - slowHead->scaleBBWeight(slowPathWeightScaleFactor); - slowHead->bbNatLoopNum = ambientLoop; - if (slowHead->bbNext != e2) { // We can't just fall through to the slow path entry, so make it an unconditional branch. + assert(slowHead->bbJumpKind == BBJ_NONE); // This is how we created it above. slowHead->bbJumpKind = BBJ_ALWAYS; slowHead->bbJumpDest = e2; } @@ -2374,12 +2273,6 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) JITDUMP("Adding " FMT_BB " -> " FMT_BB "\n", condLast->bbNum, condLast->bbNext->bbNum); fgAddRefPred(condLast->bbNext, condLast); - // If h2 is present it is already the head. Else, replace 'h' as the loop head by 'condLast'. - if (h2 == nullptr) - { - optUpdateLoopHead(loopInd, loop.lpHead, condLast); - } - // Don't unroll loops that we've cloned -- the unroller expects any loop it should unroll to // initialize the loop counter immediately before entering the loop, but we've left a shared // initialization of the loop counter up above the test that determines which version of the From 07e66b237f38ccce6800031fc8b077fba58cd638 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Wed, 29 Mar 2023 14:32:10 -0700 Subject: [PATCH 08/14] Remove special case pre-header handling in hoisting --- src/coreclr/jit/compiler.h | 6 +- src/coreclr/jit/fgopt.cpp | 2 +- src/coreclr/jit/optimizer.cpp | 136 ++++++++++++++++------------------ 3 files changed, 67 insertions(+), 77 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index d1fab2fc7222cb..0400d78bd661f5 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -5232,7 +5232,7 @@ class Compiler void vnPrint(ValueNum vn, unsigned level); #endif - bool fgDominate(BasicBlock* b1, BasicBlock* b2); // Return true if b1 dominates b2 + bool fgDominate(const BasicBlock* b1, const BasicBlock* b2); // Return true if b1 dominates b2 // Dominator computation member functions // Not exposed outside Compiler @@ -6193,7 +6193,7 @@ class Compiler bool optHoistLoopNest(unsigned lnum, LoopHoistContext* hoistCtxt); // Do hoisting for a particular loop ("lnum" is an index into the optLoopTable.) - bool optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, BasicBlockList* existingPreHeaders); + bool optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt); // Hoist all expressions in "blocks" that are invariant in loop "loopNum" (an index into the optLoopTable) // outside of that loop. @@ -6338,8 +6338,6 @@ class Compiler int lpLoopVarFPCount; // The register count for the FP LclVars that are read/written inside this loop int lpVarInOutFPCount; // The register count for the FP LclVars that are alive inside or across this loop - bool lpHoistAddedPreheader; // The loop preheader was added during hoisting - typedef JitHashTable, FieldKindForVN> FieldHandleSet; FieldHandleSet* lpFieldsModified; // This has entries for all static field and object instance fields modified diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 60300c1ef02268..6b6a421601a18e 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -26,7 +26,7 @@ // Assumptions: // -- Dominators have been calculated (`fgDomsComputed` is true). // -bool Compiler::fgDominate(BasicBlock* b1, BasicBlock* b2) +bool Compiler::fgDominate(const BasicBlock* b1, const BasicBlock* b2) { noway_assert(fgDomsComputed); diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index ff3e9c043a75f2..75e197d0ec64f4 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -6644,7 +6644,7 @@ void Compiler::optPerformHoistExpr(GenTree* origExpr, BasicBlock* exprBb, unsign } //------------------------------------------------------------------------ -// optHoistLoopNest: run loop hoisting for indicated loop and all contained loops +// optHoistLoopCode: run loop hoisting phase // // Returns: // suitable phase status @@ -6777,9 +6777,6 @@ bool Compiler::optHoistLoopNest(unsigned lnum, LoopHoistContext* hoistCtxt) m_loopsConsidered++; #endif // LOOP_HOIST_STATS - BasicBlockList* firstPreHeader = nullptr; - BasicBlockList** preHeaderAppendPtr = &firstPreHeader; - bool modified = false; if (optLoopTable[lnum].lpChild != BasicBlock::NOT_IN_LOOP) @@ -6788,30 +6785,10 @@ bool Compiler::optHoistLoopNest(unsigned lnum, LoopHoistContext* hoistCtxt) child = optLoopTable[child].lpSibling) { modified |= optHoistLoopNest(child, hoistCtxt); - - if (optLoopTable[child].lpFlags & LPFLG_HAS_PREHEAD) - { - // If a pre-header is found, add it to the tracking list. Most likely, the recursive call - // to hoist from the `child` loop nest found something to hoist and created a pre-header - // where the hoisted code was placed. - - BasicBlock* preHeaderBlock = optLoopTable[child].lpHead; - JITDUMP(" PREHEADER: " FMT_BB "\n", preHeaderBlock->bbNum); - - // Here, we are arranging the blocks in reverse lexical order, so when they are removed from the - // head of this list and pushed on the stack of hoisting blocks to process, they are in - // forward lexical order when popped. Note that for child loops `i` and `j`, in order `i` - // followed by `j` in the `lpSibling` list, that `j` is actually first in lexical order - // and that makes these blocks arranged in reverse lexical order in this list. - // That is, the sibling list is in reverse lexical order. - // Note: the sibling list order should not matter to any algorithm; this order is not guaranteed. - *preHeaderAppendPtr = new (this, CMK_LoopHoist) BasicBlockList(preHeaderBlock, nullptr); - preHeaderAppendPtr = &((*preHeaderAppendPtr)->next); - } } } - modified |= optHoistThisLoop(lnum, hoistCtxt, firstPreHeader); + modified |= optHoistThisLoop(lnum, hoistCtxt); return modified; } @@ -6822,12 +6799,11 @@ bool Compiler::optHoistLoopNest(unsigned lnum, LoopHoistContext* hoistCtxt) // Arguments: // lnum - loop to process // hoistCtxt - context for the hoisting -// existingPreHeaders - list of preheaders created for child loops // // Returns: // true if any hoisting was done // -bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, BasicBlockList* existingPreHeaders) +bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt) { LoopDsc* pLoopDsc = &optLoopTable[lnum]; @@ -6854,10 +6830,9 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi VARSET_TP loopVars(VarSetOps::Intersection(this, pLoopDsc->lpVarInOut, pLoopDsc->lpVarUseDef)); - pLoopDsc->lpVarInOutCount = VarSetOps::Count(this, pLoopDsc->lpVarInOut); - pLoopDsc->lpLoopVarCount = VarSetOps::Count(this, loopVars); - pLoopDsc->lpHoistedExprCount = 0; - pLoopDsc->lpHoistAddedPreheader = false; + pLoopDsc->lpVarInOutCount = VarSetOps::Count(this, pLoopDsc->lpVarInOut); + pLoopDsc->lpLoopVarCount = VarSetOps::Count(this, loopVars); + pLoopDsc->lpHoistedExprCount = 0; #ifndef TARGET_64BIT @@ -6933,29 +6908,72 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi // Until we have post-dominators, we'll special-case for single-exit blocks. // // Todo: it is not clear if this is a correctness requirement or a profitability heuristic. - // It seems like the latter. Ideally have enough safeguards to prevent hoisting exception - // or side-effect dependent things. + // It seems like the latter. Ideally there are enough safeguards to prevent hoisting exception + // or side-effect dependent things. Note that HoistVisitor uses `m_beforeSideEffect` to determine if it's + // ok to hoist a side-effect. It allows this only for the first block (the entry block), before any + // side-effect has been seen. After the first block, it assumes that there has been a side effect and + // no further side-effect can be hoisted. It is true that we don't analyze any program behavior in the + // flow graph between the entry block and the subsequent blocks, whether they be the next block dominating + // the exit block, or the pre-headers of nested loops. // // We really should consider hoisting from conditionally executed blocks, if they are frequently executed // and it is safe to evaluate the tree early. // ArrayStack defExec(getAllocatorLoopHoist()); - BasicBlockList* preHeadersList = existingPreHeaders; + + // Add the pre-headers of any child loops to the list of blocks to consider for hoisting. + // Note that these are not definitely executed. However, it is a heuristic that they will + // often provide good opportunities for further hoisting since we hoist from inside-out, + // and the inner loop may have already hoisted something loop-invariant to them. They may + // also be blocks which dominate the exit block and hence get added again, below. + // + // Note that all pre-headers get added first, which means they get considered for hoisting last. It is + // assumed that the order does not matter. + + int childLoopPreHeaders = 0; + for (BasicBlock::loopNumber childLoop = pLoopDsc->lpChild; // + childLoop != BasicBlock::NOT_IN_LOOP; // + childLoop = optLoopTable[childLoop].lpSibling) + { + if (optLoopTable[childLoop].lpIsRemoved()) + { + continue; + } + INDEBUG(optLoopTable[childLoop].lpValidatePreHeader()); + BasicBlock* childPreHead = optLoopTable[childLoop].lpHead; + if (pLoopDsc->lpExitCnt == 1) + { + if (fgDominate(childPreHead, pLoopDsc->lpExit)) + { + // If the child loop pre-header dominates the exit, it will get added in the dominator tree + // loop below. + continue; + } + } + else + { + // If the child loop pre-header is the loop entry for a multi-exit loop, it will get added below. + if (childPreHead == pLoopDsc->lpEntry) + { + continue; + } + } + JITDUMP(" -- " FMT_BB " (child loop pre-header)\n", childPreHead->bbNum); + defExec.Push(childPreHead); + ++childLoopPreHeaders; + } if (pLoopDsc->lpExitCnt == 1) { assert(pLoopDsc->lpExit != nullptr); JITDUMP(" Considering hoisting in blocks that either dominate exit block " FMT_BB - " or preheaders of nested loops, if any:\n", + ", or pre-headers of nested loops, if any:\n", pLoopDsc->lpExit->bbNum); // Push dominators, until we reach "entry" or exit the loop. - // Also push the preheaders that were added for the nested loops, - // if any, along the way such that in the final list, the dominating - // blocks are visited before the dominated blocks. - // - // TODO-CQ: In future, we should create preheaders upfront before building - // dominators so we don't have to do this extra work here. + // Also push the pre-headers for the nested loops, if any. The pre-headers are added first, meaning + // they are processed last (after the dominator tree). + // (TODO: is there a correctness reason for any particular ordering?) BasicBlock* cur = pLoopDsc->lpExit; while (cur != nullptr && pLoopDsc->lpContains(cur) && cur != pLoopDsc->lpEntry) @@ -6963,22 +6981,6 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi JITDUMP(" -- " FMT_BB " (dominate exit block)\n", cur->bbNum); defExec.Push(cur); cur = cur->bbIDom; - - for (; preHeadersList != nullptr; preHeadersList = preHeadersList->next) - { - BasicBlock* preHeaderBlock = preHeadersList->block; - BasicBlock* lpEntry = optLoopEntry(preHeaderBlock); - if (cur->bbNum < lpEntry->bbNum) - { - JITDUMP(" -- " FMT_BB " (preheader of " FMT_LP ")\n", preHeaderBlock->bbNum, - lpEntry->bbNatLoopNum); - defExec.Push(preHeaderBlock); - } - else - { - break; - } - } } // If we didn't reach the entry block, give up and *just* push the entry block (and the pre-headers). @@ -6987,8 +6989,8 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi JITDUMP(" -- odd, we didn't reach entry from exit via dominators. Only considering hoisting in entry " "block " FMT_BB "\n", pLoopDsc->lpEntry->bbNum); - defExec.Reset(); - preHeadersList = existingPreHeaders; + defExec.Pop(defExec.Height() - childLoopPreHeaders); + assert(defExec.Height() == childLoopPreHeaders); } } else // More than one exit @@ -7000,22 +7002,13 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt, Basi pLoopDsc->lpEntry->bbNum, lnum); } - // Push the remaining preheaders, if any. - for (; preHeadersList != nullptr; preHeadersList = preHeadersList->next) - { - BasicBlock* preHeaderBlock = preHeadersList->block; - JITDUMP(" -- " FMT_BB " (preheader of " FMT_LP ")\n", preHeaderBlock->bbNum, - optLoopEntry(preHeaderBlock)->bbNatLoopNum); - defExec.Push(preHeaderBlock); - } - JITDUMP(" -- " FMT_BB " (entry block)\n", pLoopDsc->lpEntry->bbNum); defExec.Push(pLoopDsc->lpEntry); optHoistLoopBlocks(lnum, &defExec, hoistCtxt); const unsigned numHoisted = pLoopDsc->lpHoistedFPExprCount + pLoopDsc->lpHoistedExprCount; - return pLoopDsc->lpHoistAddedPreheader || (numHoisted > 0); + return numHoisted > 0; } bool Compiler::optIsProfitableToHoistTree(GenTree* tree, unsigned lnum) @@ -7878,9 +7871,8 @@ void Compiler::optHoistCandidate(GenTree* tree, BasicBlock* treeBb, unsigned lnu return; } - // Create a loop pre-header in which to put the hoisted code. - const bool newPreheader = fgCreateLoopPreHeader(lnum); - optLoopTable[lnum].lpHoistAddedPreheader |= newPreheader; + // We should already have a pre-header for the loop. + INDEBUG(optLoopTable[lnum].lpValidatePreHeader()); // If the block we're hoisting from and the pre-header are in different EH regions, don't hoist. // TODO: we could probably hoist things that won't raise exceptions, such as constants. From 0a45df2c46e4b3796bb18de2453672d405fe24a1 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Wed, 29 Mar 2023 14:39:46 -0700 Subject: [PATCH 09/14] Remove unused SSA update code in fgCreateLoopPreHeader --- src/coreclr/jit/optimizer.cpp | 43 ++++------------------------------- 1 file changed, 5 insertions(+), 38 deletions(-) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 75e197d0ec64f4..039aa3ec4b0015 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -8029,7 +8029,7 @@ bool Compiler::optVNIsLoopInvariant(ValueNum vn, unsigned lnum, VNSet* loopVnInv // lnum - loop index // // Returns: -// true if new preheader was created +// true if new pre-header was created // bool Compiler::fgCreateLoopPreHeader(unsigned lnum) { @@ -8042,7 +8042,7 @@ bool Compiler::fgCreateLoopPreHeader(unsigned lnum) LoopDsc& loop = optLoopTable[lnum]; - // Have we already created a loop-preheader block? + // Have we already created a loop pre-header block? if (loop.lpFlags & LPFLG_HAS_PREHEAD) { @@ -8051,6 +8051,9 @@ bool Compiler::fgCreateLoopPreHeader(unsigned lnum) return false; } + // Assert that we haven't created SSA. It is assumed that we create all loop pre-headers before building SSA. + assert(fgSsaPassesCompleted == 0); + BasicBlock* head = loop.lpHead; BasicBlock* top = loop.lpTop; BasicBlock* entry = loop.lpEntry; @@ -8232,42 +8235,6 @@ bool Compiler::fgCreateLoopPreHeader(unsigned lnum) // Link in the preHead block fgInsertBBbefore(top, preHead); - // Ideally we would re-run SSA and VN if we optimized by doing loop hoisting. - // However, that is too expensive at this point. Instead, we update the phi - // node block references, if we created pre-header block due to hoisting. - // This is sufficient because any definition participating in SSA that flowed - // into the phi via the loop header block will now flow through the preheader - // block from the header block. - // TODO: if we always create and maintain pre-headers before SSA, can we delete this? - - // Assert that we haven't created SSA. It is assumed that we create all loop pre-headers before - // building SSA. - assert(fgSsaPassesCompleted == 0); - -#if 0 - for (Statement* const stmt : top->Statements()) - { - GenTree* tree = stmt->GetRootNode(); - if (tree->OperGet() != GT_ASG) - { - break; - } - GenTree* op2 = tree->gtGetOp2(); - if (op2->OperGet() != GT_PHI) - { - break; - } - for (GenTreePhi::Use& use : op2->AsPhi()->Uses()) - { - GenTreePhiArg* phiArg = use.GetNode()->AsPhiArg(); - if (phiArg->gtPredBB == head) - { - phiArg->gtPredBB = preHead; - } - } - } -#endif // 0 - // In which EH region should the pre-header live? // // The pre-header block is added immediately before `top`. From 19744dad8bdf7f74a11e42dec24b01ad3a35cef2 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Wed, 29 Mar 2023 14:55:36 -0700 Subject: [PATCH 10/14] Remove unneeded pre-header code from fgDominate --- src/coreclr/jit/fgopt.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 6b6a421601a18e..ddfa21b49cbb9a 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -60,17 +60,6 @@ bool Compiler::fgDominate(const BasicBlock* b1, const BasicBlock* b2) if (b1->bbNum > fgDomBBcount) { - // If b1 is a loop preheader (that was created after the dominators were calculated), - // then it has a single successor that is the loop entry, and it is the only non-loop - // predecessor of the loop entry. Thus, b1 dominates the loop entry and also dominates - // what the loop entry dominates. - if (b1->bbFlags & BBF_LOOP_PREHEADER) - { - BasicBlock* loopEntry = b1->GetUniqueSucc(); - assert(loopEntry != nullptr); - return fgDominate(loopEntry, b2); - } - // unknown dominators; err on the safe side and return false return false; } From c9912cd62205e5f2af4187dbc32fc5904b19d06e Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Wed, 29 Mar 2023 15:16:06 -0700 Subject: [PATCH 11/14] Remove workaround to avoid extraneous LSRA diffs due to bbNum ordering --- src/coreclr/jit/compiler.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index f1ee62d702ba18..b9199d49b646cc 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -5104,9 +5104,6 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl // Now that lowering is completed we can proceed to perform register allocation // - // // LSRA block ordering depends on bbNum. Renumber the blocks, if necessary, before LSRA. - // fgDomsComputed = false; - // fgRenumberBlocks(); auto linearScanPhase = [this]() { m_pLinearScan->doLinearScan(); }; DoPhase(this, PHASE_LINEAR_SCAN, linearScanPhase); From dddb6929fb123179e35d13d1e33fb05dd9dd8eb1 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Wed, 29 Mar 2023 15:30:52 -0700 Subject: [PATCH 12/14] Update comments --- src/coreclr/jit/optimizer.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 039aa3ec4b0015..1bd646efe3c2c6 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -6924,11 +6924,14 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt) // Add the pre-headers of any child loops to the list of blocks to consider for hoisting. // Note that these are not definitely executed. However, it is a heuristic that they will // often provide good opportunities for further hoisting since we hoist from inside-out, - // and the inner loop may have already hoisted something loop-invariant to them. They may - // also be blocks which dominate the exit block and hence get added again, below. + // and the inner loop may have already hoisted something loop-invariant to them. If the child + // loop pre-header block would be added anyway (by dominating the loop exit block), we don't + // add it here, and let it be added naturally, below. // // Note that all pre-headers get added first, which means they get considered for hoisting last. It is - // assumed that the order does not matter. + // assumed that the order does not matter for correctness (since there is no execution order known). + // Note that the order does matter for the hoisting profitability heuristics, as we might + // run out of hoisting budget when processing the blocks. int childLoopPreHeaders = 0; for (BasicBlock::loopNumber childLoop = pLoopDsc->lpChild; // @@ -6971,9 +6974,6 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt) pLoopDsc->lpExit->bbNum); // Push dominators, until we reach "entry" or exit the loop. - // Also push the pre-headers for the nested loops, if any. The pre-headers are added first, meaning - // they are processed last (after the dominator tree). - // (TODO: is there a correctness reason for any particular ordering?) BasicBlock* cur = pLoopDsc->lpExit; while (cur != nullptr && pLoopDsc->lpContains(cur) && cur != pLoopDsc->lpEntry) From df7431f665b13880f0305aed3ae16c64b3cc8cb5 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Tue, 4 Apr 2023 17:04:21 -0700 Subject: [PATCH 13/14] Improve loop table rebuilding with pre-headers When the loop table is built, it looks around for various loop patterns, including looking for a guaranteed-executed, pre-loop constant initializer. This is used in loop cloning and loop unrolling. It needs to look "a little harder" in the case we created loop pre-headers, then rebuild the loop table (currently, only due to loop unrolling of loops that contain nested loops). The new code only allows for empty pre-headers. This works since in our current phase ordering, no hoisting happens by the time the loop table is rebuilt. (Actually, it's currently not necessary to do this at all, since the constant initializer info is only used by cloning and loop unrolling, both of which have finished by the time the loop table is rebuilt. However, we might someday choose to rebuild the loop table after cloning and before unrolling, at which point it would be necessary.) --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/optimizer.cpp | 86 +++++++++++++++++++++++------------ 2 files changed, 59 insertions(+), 29 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 0400d78bd661f5..6fd8e1442f7d18 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6573,7 +6573,7 @@ class Compiler bool optComputeIterInfo(GenTree* incr, BasicBlock* from, BasicBlock* to, unsigned* pIterVar); bool optPopulateInitInfo(unsigned loopInd, BasicBlock* initBlock, GenTree* init, unsigned iterVar); bool optExtractInitTestIncr( - BasicBlock* head, BasicBlock* bottom, BasicBlock* exit, GenTree** ppInit, GenTree** ppTest, GenTree** ppIncr); + BasicBlock** pInitBlock, BasicBlock* bottom, BasicBlock* exit, GenTree** ppInit, GenTree** ppTest, GenTree** ppIncr); void optFindNaturalLoops(); diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 1bd646efe3c2c6..1939447014dc3c 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -808,15 +808,32 @@ bool Compiler::optPopulateInitInfo(unsigned loopInd, BasicBlock* initBlock, GenT return false; } - // We found an initializer in the `head` block. For this to be used, we need to make sure the + // We found an initializer in the `initBlock` block. For this to be used, we need to make sure the // "iterVar" initialization is never skipped. That is, every pred of ENTRY other than HEAD is in the loop. + // We allow one special case: the HEAD block is an empty predecessor to ENTRY, and the initBlock is the + // only predecessor to HEAD. This handles the case where we rebuild the loop table (after inserting + // pre-headers) and we still want to find the initializer before the pre-header block. for (BasicBlock* const predBlock : optLoopTable[loopInd].lpEntry->PredBlocks()) { - if ((predBlock != initBlock) && !optLoopTable[loopInd].lpContains(predBlock)) + if (!optLoopTable[loopInd].lpContains(predBlock)) { - JITDUMP(FMT_LP ": initialization not guaranteed through `head` block; ignore constant initializer\n", - loopInd); - return false; + bool initBlockOk = (predBlock == initBlock); + if (!initBlockOk) + { + if ((predBlock->bbJumpKind == BBJ_NONE) && (predBlock->bbNext == optLoopTable[loopInd].lpEntry) && + (predBlock->countOfInEdges() == 1) && (predBlock->firstStmt() == nullptr) && + (predBlock->bbPrev != nullptr) && predBlock->bbPrev->bbFallsThrough()) + { + initBlockOk = true; + } + } + if (!initBlockOk) + { + JITDUMP(FMT_LP ": initialization not guaranteed from " FMT_BB " through to entry block " FMT_BB + " from pred " FMT_BB "; ignore constant initializer\n", + loopInd, initBlock->bbNum, optLoopTable[loopInd].lpEntry->bbNum, predBlock->bbNum); + return false; + } } } @@ -1132,12 +1149,13 @@ bool Compiler::optIsLoopTestEvalIntoTemp(Statement* testStmt, Statement** newTes // Extract the "init", "test" and "incr" nodes of the loop. // // Arguments: -// head - Loop head block -// bottom - Loop bottom block -// top - Loop top block -// ppInit - The init stmt of the loop if found. -// ppTest - The test stmt of the loop if found. -// ppIncr - The incr stmt of the loop if found. +// pInitBlock - [IN/OUT] *pInitBlock is the loop head block on entry, and is set to the initBlock on exit, +// if `**ppInit` is non-null. +// bottom - Loop bottom block +// top - Loop top block +// ppInit - The init stmt of the loop if found. +// ppTest - The test stmt of the loop if found. +// ppIncr - The incr stmt of the loop if found. // // Return Value: // The results are put in "ppInit", "ppTest" and "ppIncr" if the method @@ -1146,9 +1164,8 @@ bool Compiler::optIsLoopTestEvalIntoTemp(Statement* testStmt, Statement** newTes // to nullptr. Return value will never be false if `init` is not found. // // Operation: -// Check if the "test" stmt is last stmt in the loop "bottom". If found good, -// "test" stmt is found. Try to find the "incr" stmt. Check previous stmt of -// "test" to get the "incr" stmt. If it is not found it could be a loop of the +// Check if the "test" stmt is last stmt in the loop "bottom". Try to find the "incr" stmt. +// Check previous stmt of "test" to get the "incr" stmt. If it is not found it could be a loop of the // below form. // // +-------<-----------------<-----------+ @@ -1165,8 +1182,9 @@ bool Compiler::optIsLoopTestEvalIntoTemp(Statement* testStmt, Statement** newTes // the callers are expected to verify that "iterVar" is used in the test. // bool Compiler::optExtractInitTestIncr( - BasicBlock* head, BasicBlock* bottom, BasicBlock* top, GenTree** ppInit, GenTree** ppTest, GenTree** ppIncr) + BasicBlock** pInitBlock, BasicBlock* bottom, BasicBlock* top, GenTree** ppInit, GenTree** ppTest, GenTree** ppIncr) { + assert(pInitBlock != nullptr); assert(ppInit != nullptr); assert(ppTest != nullptr); assert(ppIncr != nullptr); @@ -1218,7 +1236,22 @@ bool Compiler::optExtractInitTestIncr( // Find the last statement in the loop pre-header which we expect to be the initialization of // the loop iterator. - Statement* phdrStmt = head->firstStmt(); + BasicBlock* initBlock = *pInitBlock; + Statement* phdrStmt = initBlock->firstStmt(); + if (phdrStmt == nullptr) + { + // When we build the loop table, we canonicalize by introducing loop pre-headers for all loops. + // If we are rebuilding the loop table, we would already have the pre-header block introduced + // the first time, which might be empty if no hoisting has yet occurred. In this case, look a + // little harder for the possible loop initialization statement. + if ((initBlock->bbJumpKind == BBJ_NONE) && (initBlock->bbNext == top) && (initBlock->countOfInEdges() == 1) && + (initBlock->bbPrev != nullptr) && initBlock->bbPrev->bbFallsThrough()) + { + initBlock = initBlock->bbPrev; + phdrStmt = initBlock->firstStmt(); + } + } + if (phdrStmt != nullptr) { Statement* initStmt = phdrStmt->GetPrevStmt(); @@ -1235,11 +1268,6 @@ bool Compiler::optExtractInitTestIncr( // statements other than duplicated loop conditions. doGetPrev = (initStmt->GetPrevStmt() != nullptr); } - else - { - // Must be a duplicated loop condition. - noway_assert(initStmt->GetRootNode()->gtOper == GT_JTRUE); - } #endif // DEBUG if (doGetPrev) { @@ -1248,7 +1276,8 @@ bool Compiler::optExtractInitTestIncr( noway_assert(initStmt != nullptr); } - *ppInit = initStmt->GetRootNode(); + *ppInit = initStmt->GetRootNode(); + *pInitBlock = initBlock; } else { @@ -1372,24 +1401,25 @@ bool Compiler::optRecordLoop( // if (bottom->bbJumpKind == BBJ_COND) { - GenTree* init; - GenTree* test; - GenTree* incr; - if (!optExtractInitTestIncr(head, bottom, top, &init, &test, &incr)) + GenTree* init; + GenTree* test; + GenTree* incr; + BasicBlock* initBlock = head; + if (!optExtractInitTestIncr(&initBlock, bottom, top, &init, &test, &incr)) { JITDUMP(FMT_LP ": couldn't find init/test/incr; not LPFLG_ITER loop\n", loopInd); goto DONE_LOOP; } unsigned iterVar = BAD_VAR_NUM; - if (!optComputeIterInfo(incr, head->bbNext, bottom, &iterVar)) + if (!optComputeIterInfo(incr, top, bottom, &iterVar)) { JITDUMP(FMT_LP ": increment expression not appropriate form, or not loop invariant; not LPFLG_ITER loop\n", loopInd); goto DONE_LOOP; } - optPopulateInitInfo(loopInd, head, init, iterVar); + optPopulateInitInfo(loopInd, initBlock, init, iterVar); // Check that the iterator is used in the loop condition. if (!optCheckIterInLoopTest(loopInd, test, iterVar)) From cb7f42f2fa03ed14f05c2310e96642579f6e3cf4 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Tue, 4 Apr 2023 18:52:06 -0700 Subject: [PATCH 14/14] Update comments --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/fgopt.cpp | 10 +++++++--- src/coreclr/jit/loopcloning.cpp | 4 ++-- src/coreclr/jit/optimizer.cpp | 33 ++++++++++++++++++++++++++++----- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 6fd8e1442f7d18..b227a344410f72 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6511,7 +6511,7 @@ class Compiler public: LoopDsc* optLoopTable; // loop descriptor table bool optLoopTableValid; // info in loop table should be valid - bool optLoopsRequirePreHeaders; // Do we require that all loops have pre-headers? + bool optLoopsRequirePreHeaders; // Do we require that all loops (in the loop table) have pre-headers? unsigned char optLoopCount; // number of tracked loops unsigned char loopAlignCandidates; // number of loops identified for alignment diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index ddfa21b49cbb9a..84e6ce0ef21d76 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -1937,8 +1937,8 @@ bool Compiler::fgCanCompactBlocks(BasicBlock* block, BasicBlock* bNext) } // Don't allow removing an empty loop pre-header. - // We can compact a pre-header into an empty `block`, as long as we propagate the BBF_LOOP_PREHEADER - // bit properly. + // We can compact a pre-header `bNext` into an empty `block` since BBF_COMPACT_UPD propagates + // BBF_LOOP_PREHEADER to `block`. if (optLoopsRequirePreHeaders) { if (((block->bbFlags & BBF_LOOP_PREHEADER) != 0) && (bNext->countOfInEdges() != 1)) @@ -2056,7 +2056,11 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext) JITDUMP("Second block has %u other incoming edges\n", bNext->countOfInEdges()); assert(block->isEmpty()); - // `block` can no longer be a loop pre-header (if it was before). + // When loops require pre-headers, `block` cannot be a pre-header. + // We should have screened this out in fgCanCompactBlocks(). + // + // When pre-headers are not required, then if `block` was a pre-header, + // it no longer is. // assert(!optLoopsRequirePreHeaders || ((block->bbFlags & BBF_LOOP_PREHEADER) == 0)); block->bbFlags &= ~BBF_LOOP_PREHEADER; diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp index 681df8e88394fa..0f28c613470919 100644 --- a/src/coreclr/jit/loopcloning.cpp +++ b/src/coreclr/jit/loopcloning.cpp @@ -2148,8 +2148,8 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) } #endif - // If the loop we're cloning contains a nested loop, we need to clear the pre-header bit on - // the nested loop pre-header block, since it will no longer be a loop pre-header. (This is because + // If the loop we're cloning contains nested loops, we need to clear the pre-header bit on + // any nested loop pre-header blocks, since they will no longer be loop pre-headers. (This is because // we don't add the slow loop or its child loops to the loop table. It would be simplest to // just re-build the loop table if we want to enable loop optimization of the slow path loops.) if ((newBlk->bbFlags & BBF_LOOP_PREHEADER) != 0) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 1939447014dc3c..96790a373513c8 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -2718,7 +2718,7 @@ void Compiler::optFindNaturalLoops() fgUpdateChangedFlowGraph(FlowGraphUpdates::COMPUTE_DOMS); } - // Create loop pre-header for every loop. + // Create a loop pre-header for every loop. bool modForPreHeader = false; for (unsigned loopInd = 0; loopInd < optLoopCount; loopInd++) { @@ -2729,7 +2729,6 @@ void Compiler::optFindNaturalLoops() } if (modForPreHeader) { - // The predecessors were maintained in fgCreateLoopPreHeader; don't rebuild them. fgUpdateChangedFlowGraph(FlowGraphUpdates::COMPUTE_DOMS); } @@ -4600,8 +4599,6 @@ PhaseStatus Compiler::optUnrollLoops() // // If the initBlock is a BBJ_COND drop the condition (and make initBlock a BBJ_NONE block). // - // Also, make sure HEAD is a BBJ_NONE block. - // if (initBlock->bbJumpKind == BBJ_COND) { assert(dupCond); @@ -6952,7 +6949,7 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt) ArrayStack defExec(getAllocatorLoopHoist()); // Add the pre-headers of any child loops to the list of blocks to consider for hoisting. - // Note that these are not definitely executed. However, it is a heuristic that they will + // Note that these are not necessarily definitely executed. However, it is a heuristic that they will // often provide good opportunities for further hoisting since we hoist from inside-out, // and the inner loop may have already hoisted something loop-invariant to them. If the child // loop pre-header block would be added anyway (by dominating the loop exit block), we don't @@ -6962,6 +6959,32 @@ bool Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt) // assumed that the order does not matter for correctness (since there is no execution order known). // Note that the order does matter for the hoisting profitability heuristics, as we might // run out of hoisting budget when processing the blocks. + // + // For example, consider this loop nest: + // + // for (....) { // loop L00 + // pre-header 1 + // for (...) { // loop L01 + // } + // // pre-header 2 + // for (...) { // loop L02 + // // pre-header 3 + // for (...) { // loop L03 + // } + // } + // } + // + // When processing the outer loop L00 (with an assumed single exit), we will push on the defExec stack + // pre-header 2, pre-header 1, the loop exit block, any IDom tree blocks leading to the entry block, + // and finally the entry block. (Note that the child loop iteration order of a loop is from "farthest" + // from the loop "head" to "nearest".) Blocks are considered for hoisting in the opposite order. + // + // Note that pre-header 3 is not pushed, since it is not a direct child. It would have been processed + // when loop L02 was considered for hoisting. + // + // The order of pushing pre-header 1 and pre-header 2 is based on the order in the loop table (which is + // convenient). But note that it is arbitrary because there is not guaranteed execution order amongst + // the child loops. int childLoopPreHeaders = 0; for (BasicBlock::loopNumber childLoop = pLoopDsc->lpChild; //