From b6adebcc7ef911b4ca99cc8eb9aeed1681f95391 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Thu, 7 Nov 2024 08:01:35 -0800 Subject: [PATCH 1/5] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20ch?= =?UTF-8?q?anges=20to=20main=20this=20commit=20is=20based=20on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.4 [skip ci] --- bolt/docs/BAT.md | 12 +- bolt/include/bolt/Core/BinaryFunction.h | 4 + .../bolt/Profile/BoltAddressTranslation.h | 3 + bolt/include/bolt/Profile/DataAggregator.h | 3 +- bolt/lib/Profile/BoltAddressTranslation.cpp | 70 +++++---- bolt/lib/Profile/DataAggregator.cpp | 111 +++++++++----- bolt/test/X86/callcont-fallthru.s | 138 ++++++++++++++++++ 7 files changed, 273 insertions(+), 68 deletions(-) create mode 100644 bolt/test/X86/callcont-fallthru.s diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md index 817ad288aa34b..3b42c36541acd 100644 --- a/bolt/docs/BAT.md +++ b/bolt/docs/BAT.md @@ -54,7 +54,7 @@ Functions table: | table | | | | Secondary entry | -| points | +| points and LPs | |------------------| ``` @@ -80,7 +80,7 @@ Hot indices are delta encoded, implicitly starting at zero. | `HotIndex` | Delta, ULEB128 | Index of corresponding hot function in hot functions table | Cold | | `FuncHash` | 8b | Function hash for input function | Hot | | `NumBlocks` | ULEB128 | Number of basic blocks in the original function | Hot | -| `NumSecEntryPoints` | ULEB128 | Number of secondary entry points in the original function | Hot | +| `NumSecEntryPoints` | ULEB128 | Number of secondary entry points and landing pads in the original function | Hot | | `ColdInputSkew` | ULEB128 | Skew to apply to all input offsets | Cold | | `NumEntries` | ULEB128 | Number of address translation entries for a function | Both | | `EqualElems` | ULEB128 | Number of equal offsets in the beginning of a function | Both | @@ -116,7 +116,11 @@ input basic block mapping. ### Secondary Entry Points table The table is emitted for hot fragments only. It contains `NumSecEntryPoints` -offsets denoting secondary entry points, delta encoded, implicitly starting at zero. +offsets denoting secondary entry points and landing pads, delta encoded, +implicitly starting at zero. | Entry | Encoding | Description | | ----- | -------- | ----------- | -| `SecEntryPoint` | Delta, ULEB128 | Secondary entry point offset | +| `SecEntryPoint` | Delta, ULEB128 | Secondary entry point offset with `LPENTRY` LSB bit | + +`LPENTRY` bit denotes whether a given offset is a landing pad block. If not set, +the offset is a secondary entry point. diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 0b875ed688101..0b3682353f736 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -908,6 +908,10 @@ class BinaryFunction { return BB && BB->getOffset() == Offset ? BB : nullptr; } + const BinaryBasicBlock *getBasicBlockAtOffset(uint64_t Offset) const { + return const_cast(this)->getBasicBlockAtOffset(Offset); + } + /// Retrieve the landing pad BB associated with invoke instruction \p Invoke /// that is in \p BB. Return nullptr if none exists BinaryBasicBlock *getLandingPadBBFor(const BinaryBasicBlock &BB, diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h index fcc578f35e322..aaf361b093bfd 100644 --- a/bolt/include/bolt/Profile/BoltAddressTranslation.h +++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h @@ -182,6 +182,9 @@ class BoltAddressTranslation { /// translation map entry const static uint32_t BRANCHENTRY = 0x1; + /// Identifies a landing pad in secondary entry point map entry. + const static uint32_t LPENTRY = 0x1; + public: /// Map basic block input offset to a basic block index and hash pair. class BBHashMapTy { diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index 6453b3070ceb8..2880bfd03be78 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -266,7 +266,8 @@ class DataAggregator : public DataReader { uint64_t Mispreds); /// Register a \p Branch. - bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds); + bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds, + bool IsPreagg); /// Register a trace between two LBR entries supplied in execution order. bool doTrace(const LBREntry &First, const LBREntry &Second, diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp index 4d005f942699e..a423315df0793 100644 --- a/bolt/lib/Profile/BoltAddressTranslation.cpp +++ b/bolt/lib/Profile/BoltAddressTranslation.cpp @@ -86,21 +86,16 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) { if (Function.isIgnored() || (!BC.HasRelocations && !Function.isSimple())) continue; - uint32_t NumSecondaryEntryPoints = 0; - Function.forEachEntryPoint([&](uint64_t Offset, const MCSymbol *) { - if (!Offset) - return true; - ++NumSecondaryEntryPoints; - SecondaryEntryPointsMap[OutputAddress].push_back(Offset); - return true; - }); - LLVM_DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n"); LLVM_DEBUG(dbgs() << " Address reference: 0x" << Twine::utohexstr(Function.getOutputAddress()) << "\n"); LLVM_DEBUG(dbgs() << formatv(" Hash: {0:x}\n", getBFHash(InputAddress))); - LLVM_DEBUG(dbgs() << " Secondary Entry Points: " << NumSecondaryEntryPoints - << '\n'); + LLVM_DEBUG({ + uint32_t NumSecondaryEntryPoints = 0; + if (SecondaryEntryPointsMap.count(InputAddress)) + NumSecondaryEntryPoints = SecondaryEntryPointsMap[InputAddress].size(); + dbgs() << " Secondary Entry Points: " << NumSecondaryEntryPoints << '\n'; + }); MapTy Map; for (const BinaryBasicBlock *const BB : @@ -206,10 +201,9 @@ void BoltAddressTranslation::writeMaps(uint64_t &PrevAddress, raw_ostream &OS) { << Twine::utohexstr(Address) << ".\n"); encodeULEB128(Address - PrevAddress, OS); PrevAddress = Address; - const uint32_t NumSecondaryEntryPoints = - SecondaryEntryPointsMap.count(Address) - ? SecondaryEntryPointsMap[Address].size() - : 0; + uint32_t NumSecondaryEntryPoints = 0; + if (SecondaryEntryPointsMap.count(HotInputAddress)) + NumSecondaryEntryPoints = SecondaryEntryPointsMap[HotInputAddress].size(); uint32_t Skew = 0; if (Cold) { auto HotEntryIt = llvm::lower_bound(HotFuncs, ColdPartSource[Address]); @@ -281,7 +275,7 @@ void BoltAddressTranslation::writeMaps(uint64_t &PrevAddress, raw_ostream &OS) { if (!Cold && NumSecondaryEntryPoints) { LLVM_DEBUG(dbgs() << "Secondary entry points: "); // Secondary entry point offsets, delta-encoded - for (uint32_t Offset : SecondaryEntryPointsMap[Address]) { + for (uint32_t Offset : SecondaryEntryPointsMap[HotInputAddress]) { encodeULEB128(Offset - PrevOffset, OS); LLVM_DEBUG(dbgs() << formatv("{0:x} ", Offset)); PrevOffset = Offset; @@ -469,8 +463,12 @@ void BoltAddressTranslation::dump(raw_ostream &OS) const { const std::vector &SecondaryEntryPoints = SecondaryEntryPointsIt->second; OS << SecondaryEntryPoints.size() << " secondary entry points:\n"; - for (uint32_t EntryPointOffset : SecondaryEntryPoints) - OS << formatv("{0:x}\n", EntryPointOffset); + for (uint32_t EntryPointOffset : SecondaryEntryPoints) { + OS << formatv("{0:x}", EntryPointOffset >> 1); + if (EntryPointOffset & LPENTRY) + OS << " (lp)"; + OS << '\n'; + } } OS << "\n"; } @@ -582,14 +580,21 @@ void BoltAddressTranslation::saveMetadata(BinaryContext &BC) { // changed if (BF.isIgnored() || (!BC.HasRelocations && !BF.isSimple())) continue; + const uint64_t FuncAddress = BF.getAddress(); // Prepare function and block hashes - FuncHashes.addEntry(BF.getAddress(), BF.computeHash()); + FuncHashes.addEntry(FuncAddress, BF.computeHash()); BF.computeBlockHashes(); - BBHashMapTy &BBHashMap = getBBHashMap(BF.getAddress()); + BBHashMapTy &BBHashMap = getBBHashMap(FuncAddress); + std::vector SecondaryEntryPoints; // Set BF/BB metadata - for (const BinaryBasicBlock &BB : BF) + for (const BinaryBasicBlock &BB : BF) { BBHashMap.addEntry(BB.getInputOffset(), BB.getIndex(), BB.getHash()); - NumBasicBlocksMap.emplace(BF.getAddress(), BF.size()); + bool IsLandingPad = BB.isLandingPad(); + if (IsLandingPad || BF.getSecondaryEntryPointSymbol(BB)) + SecondaryEntryPoints.emplace_back(BB.getOffset() << 1 | IsLandingPad); + } + SecondaryEntryPointsMap.emplace(FuncAddress, SecondaryEntryPoints); + NumBasicBlocksMap.emplace(FuncAddress, BF.size()); } } @@ -599,13 +604,20 @@ BoltAddressTranslation::getSecondaryEntryPointId(uint64_t Address, auto FunctionIt = SecondaryEntryPointsMap.find(Address); if (FunctionIt == SecondaryEntryPointsMap.end()) return 0; - const std::vector &Offsets = FunctionIt->second; - auto OffsetIt = std::find(Offsets.begin(), Offsets.end(), Offset); - if (OffsetIt == Offsets.end()) - return 0; - // Adding one here because main entry point is not stored in BAT, and - // enumeration for secondary entry points starts with 1. - return OffsetIt - Offsets.begin() + 1; + unsigned EntryPoints = 0; + // Note we need to scan the vector to get the entry point id because it + // contains both entry points and landing pads. + for (uint32_t Off : FunctionIt->second) { + // Skip landing pads. + if (Off & LPENTRY) + continue; + // Adding one here because main entry point is not stored in BAT, and + // enumeration for secondary entry points starts with 1. + if (Off >> 1 == Offset) + return EntryPoints + 1; + ++EntryPoints; + } + return 0; } std::pair diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index ffd693f9bbaed..f667fcd4f049a 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -778,42 +778,75 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, } bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, - uint64_t Mispreds) { - bool IsReturn = false; - auto handleAddress = [&](uint64_t &Addr, bool IsFrom) -> BinaryFunction * { - if (BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr)) { - Addr -= Func->getAddress(); - if (IsFrom) { - auto checkReturn = [&](auto MaybeInst) { - IsReturn = MaybeInst && BC->MIB->isReturn(*MaybeInst); - }; - if (Func->hasInstructions()) - checkReturn(Func->getInstructionAtOffset(Addr)); - else - checkReturn(Func->disassembleInstructionAtOffset(Addr)); - } + uint64_t Mispreds, bool IsPreagg) { + // Returns whether \p Offset in \p Func contains a return instruction. + auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) { + auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); }; + return Func.hasInstructions() + ? isReturn(Func.getInstructionAtOffset(Offset)) + : isReturn(Func.disassembleInstructionAtOffset(Offset)); + }; - if (BAT) - Addr = BAT->translate(Func->getAddress(), Addr, IsFrom); + // Returns whether \p Offset in \p Func may be a call continuation excluding + // entry points and landing pads. + auto checkCallCont = [&](const BinaryFunction &Func, const uint64_t Offset) { + // No call continuation at a function start. + if (!Offset) + return false; + + // FIXME: support BAT case where the function might be in empty state + // (split fragments declared non-simple). + if (!Func.hasCFG()) + return false; + + // The offset should not be an entry point or a landing pad. + const BinaryBasicBlock *ContBB = Func.getBasicBlockAtOffset(Offset); + return ContBB && !ContBB->isEntryPoint() && !ContBB->isLandingPad(); + }; - if (BinaryFunction *ParentFunc = getBATParentFunction(*Func)) { - Func = ParentFunc; - if (IsFrom) - NumColdSamples += Count; - } + // Mutates \p Addr to an offset into the containing function, performing BAT + // offset translation and parent lookup. + // + // Returns the containing function (or BAT parent) and whether the address + // corresponds to a return (if \p IsFrom) or a call continuation (otherwise). + auto handleAddress = [&](uint64_t &Addr, bool IsFrom) { + BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr); + if (!Func) + return std::pair{Func, false}; - return Func; - } - return nullptr; + Addr -= Func->getAddress(); + + bool IsRetOrCallCont = + IsFrom ? checkReturn(*Func, Addr) : checkCallCont(*Func, Addr); + + if (BAT) + Addr = BAT->translate(Func->getAddress(), Addr, IsFrom); + + BinaryFunction *ParentFunc = getBATParentFunction(*Func); + if (!ParentFunc) + return std::pair{Func, IsRetOrCallCont}; + + if (IsFrom) + NumColdSamples += Count; + + return std::pair{ParentFunc, IsRetOrCallCont}; }; - BinaryFunction *FromFunc = handleAddress(From, /*IsFrom=*/true); + uint64_t ToOrig = To; + auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom=*/true); + auto [ToFunc, IsCallCont] = handleAddress(To, /*IsFrom=*/false); + if (!FromFunc && !ToFunc) + return false; + + // Record call to continuation trace. + if (IsPreagg && FromFunc != ToFunc && (IsReturn || IsCallCont)) { + LBREntry First{ToOrig - 1, ToOrig - 1, false}; + LBREntry Second{ToOrig, ToOrig, false}; + return doTrace(First, Second, Count); + } // Ignore returns. if (IsReturn) return true; - BinaryFunction *ToFunc = handleAddress(To, /*IsFrom=*/false); - if (!FromFunc && !ToFunc) - return false; // Treat recursive control transfers as inter-branches. if (FromFunc == ToFunc && To != 0) { @@ -830,10 +863,19 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(Second.From); if (!FromFunc || !ToFunc) { LLVM_DEBUG({ - dbgs() << "Out of range trace starting in " << FromFunc->getPrintName() - << formatv(" @ {0:x}", First.To - FromFunc->getAddress()) - << " and ending in " << ToFunc->getPrintName() - << formatv(" @ {0:x}\n", Second.From - ToFunc->getAddress()); + dbgs() << "Out of range trace starting in "; + if (FromFunc) + dbgs() << formatv("{0} @ {1:x}", *FromFunc, + First.To - FromFunc->getAddress()); + else + dbgs() << Twine::utohexstr(First.To); + dbgs() << " and ending in "; + if (ToFunc) + dbgs() << formatv("{0} @ {1:x}", *ToFunc, + Second.From - ToFunc->getAddress()); + else + dbgs() << Twine::utohexstr(Second.From); + dbgs() << '\n'; }); NumLongRangeTraces += Count; return false; @@ -1620,7 +1662,8 @@ void DataAggregator::processBranchEvents() { for (const auto &AggrLBR : BranchLBRs) { const Trace &Loc = AggrLBR.first; const TakenBranchInfo &Info = AggrLBR.second; - doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount); + doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount, + /*IsPreagg*/ false); } } @@ -1781,7 +1824,7 @@ void DataAggregator::processPreAggregated() { switch (AggrEntry.EntryType) { case AggregatedLBREntry::BRANCH: doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count, - AggrEntry.Mispreds); + AggrEntry.Mispreds, /*IsPreagg=*/true); break; case AggregatedLBREntry::FT: case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: { diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s new file mode 100644 index 0000000000000..641beb79ecf2a --- /dev/null +++ b/bolt/test/X86/callcont-fallthru.s @@ -0,0 +1,138 @@ +## Ensures that a call continuation fallthrough count is set when using +## pre-aggregated perf data. + +# RUN: %clangxx %cxxflags %s -o %t -Wl,-q -nostdlib +# RUN: link_fdata %s %t %t.pa1 PREAGG +# RUN: link_fdata %s %t %t.pa2 PREAGG2 +# RUN: link_fdata %s %t %t.pa3 PREAGG3 +# RUN: link_fdata %s %t %t.pa4 PREAGG4 + +## Check normal case: fallthrough is not LP or secondary entry. +# RUN: llvm-strip --strip-unneeded %t -o %t.exe +# RUN: llvm-bolt %t.exe --pa -p %t.pa1 -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s + +## Check that getFallthroughsInTrace correctly handles a trace starting at plt +## call continuation +# RUN: llvm-bolt %t.exe --pa -p %t.pa2 -o %t.out2 \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK2 + +## Check that we don't treat secondary entry points as call continuation sites. +# RUN: llvm-bolt %t --pa -p %t.pa3 -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3 + +## Check fallthrough to a landing pad case. +# RUN: llvm-bolt %t.exe --pa -p %t.pa4 -o %t.out --enable-bat \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK4 + +## Check that a landing pad is emitted in BAT +# RUN: llvm-bat-dump %t.out --dump-all | FileCheck %s --check-prefix=CHECK-BAT + +# CHECK-BAT: 1 secondary entry points: +# CHECK-BAT-NEXT: 0x38 (lp) + + .globl foo + .type foo, %function +foo: + pushq %rbp + movq %rsp, %rbp + popq %rbp +Lfoo_ret: + retq +.size foo, .-foo + + .globl main + .type main, %function +main: +.Lfunc_begin0: + .cfi_startproc + .cfi_personality 155, DW.ref.__gxx_personality_v0 + .cfi_lsda 27, .Lexception0 + pushq %rbp + movq %rsp, %rbp + subq $0x20, %rsp + movl $0x0, -0x4(%rbp) + movl %edi, -0x8(%rbp) + movq %rsi, -0x10(%rbp) + callq puts@PLT +## Target is a call continuation +# PREAGG: B X:0 #Ltmp1# 2 0 +# CHECK: callq puts@PLT +# CHECK-NEXT: count: 2 + +Ltmp1: + movq -0x10(%rbp), %rax + movq 0x8(%rax), %rdi + movl %eax, -0x14(%rbp) + +Ltmp4: + cmpl $0x0, -0x14(%rbp) + je Ltmp0 +# CHECK2: je .Ltmp0 +# CHECK2-NEXT: count: 3 + + movl $0xa, -0x18(%rbp) + callq foo +## Target is a call continuation +# PREAGG: B #Lfoo_ret# #Ltmp3# 1 0 +# CHECK: callq foo +# CHECK-NEXT: count: 1 + +## PLT call continuation fallthrough spanning the call +# PREAGG2: F #Ltmp1# #Ltmp3_br# 3 +# CHECK2: callq foo +# CHECK2-NEXT: count: 3 + +## Target is a secondary entry point +# PREAGG3: B X:0 #Ltmp3# 2 0 +# CHECK3: callq foo +# CHECK3-NEXT: count: 0 + +## Target is a landing pad +# PREAGG4: B X:0 #Ltmp3# 2 0 +# CHECK4: callq puts@PLT +# CHECK4-NEXT: count: 0 + +Ltmp3: + cmpl $0x0, -0x18(%rbp) +Ltmp3_br: + jmp Ltmp2 + +Ltmp2: + movl -0x18(%rbp), %eax + addl $-0x1, %eax + movl %eax, -0x18(%rbp) + jmp Ltmp3 + jmp Ltmp4 + jmp Ltmp1 + +Ltmp0: + xorl %eax, %eax + addq $0x20, %rsp + popq %rbp + retq +.Lfunc_end0: + .cfi_endproc +.size main, .-main + + .section .gcc_except_table,"a",@progbits + .p2align 2, 0x0 +GCC_except_table0: +.Lexception0: + .byte 255 # @LPStart Encoding = omit + .byte 255 # @TType Encoding = omit + .byte 1 # Call site Encoding = uleb128 + .uleb128 .Lcst_end0-.Lcst_begin0 +.Lcst_begin0: + .uleb128 .Lfunc_begin0-.Lfunc_begin0 # >> Call Site 1 << + .uleb128 .Lfunc_end0-.Lfunc_begin0 # Call between .Lfunc_begin0 and .Lfunc_end0 + .uleb128 Ltmp3-.Lfunc_begin0 # jumps to Ltmp3 + .byte 0 # has no landing pad + .byte 0 # On action: cleanup +.Lcst_end0: + .p2align 2, 0x0 + .hidden DW.ref.__gxx_personality_v0 + .weak DW.ref.__gxx_personality_v0 + .section .data.DW.ref.__gxx_personality_v0,"awG",@progbits,DW.ref.__gxx_personality_v0,comdat + .p2align 3, 0x0 + .type DW.ref.__gxx_personality_v0,@object From 9476fad1aa50282a38614a63a6a5a41f0ac42532 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 27 Nov 2024 15:59:02 +0100 Subject: [PATCH 2/5] Preserve CFG until BAT, use it to check call cont landing pads, encode them in secondary entry points table --- bolt/docs/BAT.md | 20 +--- .../bolt/Profile/BoltAddressTranslation.h | 26 +---- bolt/lib/Core/BinaryEmitter.cpp | 3 +- bolt/lib/Profile/BoltAddressTranslation.cpp | 104 +++++------------- bolt/lib/Profile/DataAggregator.cpp | 17 +-- 5 files changed, 44 insertions(+), 126 deletions(-) diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md index 20340884621b9..828114310e195 100644 --- a/bolt/docs/BAT.md +++ b/bolt/docs/BAT.md @@ -115,21 +115,13 @@ Deleted basic blocks are emitted as having `OutputOffset` equal to the size of the function. They don't affect address translation and only participate in input basic block mapping. -### Secondary Entry Points table +### Secondary Entry Points and Call Continuation Landing Pads table The table is emitted for hot fragments only. It contains `NumSecEntryPoints` -offsets denoting secondary entry points, delta encoded, implicitly starting at zero. +offsets, delta encoded, implicitly starting at zero. | Entry | Encoding | Description | | ----- | -------- | ----------- | -| `SecEntryPoint` | Delta, ULEB128 | Secondary entry point offset | +| `OutputOffset` | Delta, ULEB128 | An offset of secondary entry point or a call continuation landing pad\*| -### Call continuation landing pads table -This table contains the addresses of call continuation blocks that are also -landing pads, to aid pre-aggregated profile conversion. The table is optional -for backwards compatibility, but new versions of BOLT will always emit it. - -| Entry | Encoding | Description | -| ----- | -------- | ----------- | -| `NumEntries` | ULEB128 | Number of addresses | -| `InputAddress` | Delta, ULEB128 | `NumEntries` input addresses of call continuation landing pad blocks | - -Addresses are delta encoded, implicitly starting at zero. +Call continuation landing pads offsets are shifted by the size of the function +for backwards compatibility (treated as entry points past the end of the +function). diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h index b04ed7a82eeef..f956f48b8356b 100644 --- a/bolt/include/bolt/Profile/BoltAddressTranslation.h +++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h @@ -143,20 +143,12 @@ class BoltAddressTranslation { /// Write the serialized address translation table for a function. template void writeMaps(uint64_t &PrevAddress, raw_ostream &OS); - /// Write call continuation landing pad addresses. - void writeCallContLandingPads(raw_ostream &OS); - /// Read the serialized address translation table for a function. /// Return a parse error if failed. template void parseMaps(uint64_t &PrevAddress, DataExtractor &DE, uint64_t &Offset, Error &Err); - - /// Read the table with call continuation landing pad offsets. - void parseCallContLandingPads(DataExtractor &DE, uint64_t &Offset, - Error &Err); - /// Returns the bitmask with set bits corresponding to indices of BRANCHENTRY /// entries in function address translation map. APInt calculateBranchEntriesBitMask(MapTy &Map, size_t EqualElems) const; @@ -176,14 +168,6 @@ class BoltAddressTranslation { /// Map a function to its secondary entry points vector std::unordered_map> SecondaryEntryPointsMap; - /// Vector with call continuation landing pads input addresses (pre-BOLT - /// binary). - std::vector CallContLandingPadAddrs; - - /// Return a secondary entry point ID for a function located at \p Address and - /// \p Offset within that function. - unsigned getSecondaryEntryPointId(uint64_t Address, uint32_t Offset) const; - /// Links outlined cold bocks to their original function std::map ColdPartSource; @@ -195,13 +179,9 @@ class BoltAddressTranslation { const static uint32_t BRANCHENTRY = 0x1; public: - /// Returns whether a given \p Offset is a secondary entry point in function - /// with address \p Address. - bool isSecondaryEntry(uint64_t Address, uint32_t Offset) const; - - /// Returns whether a given \p Offset is a call continuation landing pad in - /// function with address \p Address. - bool isCallContinuationLandingPad(uint64_t Address, uint32_t Offset) const; + /// Return a secondary entry point ID for a function located at \p Address and + /// \p Offset within that function. + unsigned getSecondaryEntryPointId(uint64_t Address, uint32_t Offset) const; /// Map basic block input offset to a basic block index and hash pair. class BBHashMapTy { diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index f34a94c577921..31b93418f5394 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -257,7 +257,8 @@ void BinaryEmitter::emitFunctions() { Streamer.setAllowAutoPadding(OriginalAllowAutoPadding); if (Emitted) - Function->setEmitted(/*KeepCFG=*/opts::PrintCacheMetrics); + Function->setEmitted(/*KeepCFG=*/opts::PrintCacheMetrics || + opts::EnableBAT); } }; diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp index 49b272ebd35c1..6c26ed3e26b41 100644 --- a/bolt/lib/Profile/BoltAddressTranslation.cpp +++ b/bolt/lib/Profile/BoltAddressTranslation.cpp @@ -87,13 +87,28 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) { continue; uint32_t NumSecondaryEntryPoints = 0; - Function.forEachEntryPoint([&](uint64_t Offset, const MCSymbol *) { - if (!Offset) - return true; + for (const BinaryBasicBlock &BB : llvm::drop_begin(Function)) { + if (BB.isEntryPoint()) { + ++NumSecondaryEntryPoints; + SecondaryEntryPointsMap[OutputAddress].push_back(BB.getOffset()); + continue; + } + // Add call continuation landing pads, offset by function size + if (!BB.isLandingPad()) + continue; + const BinaryBasicBlock *PrevBB = + Function.getLayout().getBlock(BB.getIndex() - 1); + if (!PrevBB->isSuccessor(&BB)) + continue; + const MCInst *Instr = PrevBB->getLastNonPseudoInstr(); + if (!Instr || !BC.MIB->isCall(*Instr)) + continue; ++NumSecondaryEntryPoints; - SecondaryEntryPointsMap[OutputAddress].push_back(Offset); - return true; - }); + SecondaryEntryPointsMap[OutputAddress].push_back( + Function.getOutputSize() + BB.getOffset()); + } + if (NumSecondaryEntryPoints) + llvm::sort(SecondaryEntryPointsMap[OutputAddress]); LLVM_DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n"); LLVM_DEBUG(dbgs() << " Address reference: 0x" @@ -145,7 +160,6 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) { uint64_t PrevAddress = 0; writeMaps(PrevAddress, OS); writeMaps(PrevAddress, OS); - writeCallContLandingPads(OS); BC.outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n"; BC.outs() << "BOLT-INFO: Wrote " << FuncHashes.getNumFunctions() @@ -292,15 +306,6 @@ void BoltAddressTranslation::writeMaps(uint64_t &PrevAddress, raw_ostream &OS) { } } -void BoltAddressTranslation::writeCallContLandingPads(raw_ostream &OS) { - encodeULEB128(CallContLandingPadAddrs.size(), OS); - uint64_t PrevAddress = 0; - for (const uint64_t Address : CallContLandingPadAddrs) { - encodeULEB128(Address - PrevAddress, OS); - PrevAddress = Address; - } -} - std::error_code BoltAddressTranslation::parse(raw_ostream &OS, StringRef Buf) { DataExtractor DE = DataExtractor(Buf, true, 8); uint64_t Offset = 0; @@ -325,8 +330,6 @@ std::error_code BoltAddressTranslation::parse(raw_ostream &OS, StringRef Buf) { parseMaps(PrevAddress, DE, Offset, Err); parseMaps(PrevAddress, DE, Offset, Err); OS << "BOLT-INFO: Parsed " << Maps.size() << " BAT entries\n"; - if (Offset < Buf.size()) - parseCallContLandingPads(DE, Offset, Err); return errorToErrorCode(std::move(Err)); } @@ -446,21 +449,6 @@ void BoltAddressTranslation::parseMaps(uint64_t &PrevAddress, DataExtractor &DE, } } -void BoltAddressTranslation::parseCallContLandingPads(DataExtractor &DE, - uint64_t &Offset, - Error &Err) { - const uint32_t NumEntries = DE.getULEB128(&Offset, &Err); - LLVM_DEBUG(dbgs() << "Parsing " << NumEntries - << " call continuation landing pad entries\n"); - CallContLandingPadAddrs.reserve(NumEntries); - uint64_t PrevAddress = 0; - for (uint32_t I = 0; I < NumEntries; ++I) { - const uint64_t Address = PrevAddress + DE.getULEB128(&Offset, &Err); - CallContLandingPadAddrs.emplace_back(Address); - PrevAddress = Address; - } -} - void BoltAddressTranslation::dump(raw_ostream &OS) const { const size_t NumTables = Maps.size(); OS << "BAT tables for " << NumTables << " functions:\n"; @@ -609,25 +597,14 @@ void BoltAddressTranslation::saveMetadata(BinaryContext &BC) { // changed if (BF.isIgnored() || (!BC.HasRelocations && !BF.isSimple())) continue; - const uint64_t FuncAddress = BF.getAddress(); // Prepare function and block hashes - FuncHashes.addEntry(FuncAddress, BF.computeHash()); + FuncHashes.addEntry(BF.getAddress(), BF.computeHash()); BF.computeBlockHashes(); - BBHashMapTy &BBHashMap = getBBHashMap(FuncAddress); + BBHashMapTy &BBHashMap = getBBHashMap(BF.getAddress()); // Set BF/BB metadata - for (const BinaryBasicBlock &BB : BF) { - const uint32_t BlockOffset = BB.getInputOffset(); - BBHashMap.addEntry(BlockOffset, BB.getIndex(), BB.getHash()); - // Set CallContLandingPads - if (!BB.isEntryPoint() && BB.isLandingPad()) { - const BinaryBasicBlock *PrevBB = - BF.getLayout().getBlock(BB.getIndex() - 1); - const MCInst *Instr = PrevBB->getLastNonPseudoInstr(); - if (Instr && BC.MIB->isCall(*Instr)) - CallContLandingPadAddrs.emplace_back(FuncAddress + BlockOffset); - } - } - NumBasicBlocksMap.emplace(FuncAddress, BF.size()); + for (const BinaryBasicBlock &BB : BF) + BBHashMap.addEntry(BB.getInputOffset(), BB.getIndex(), BB.getHash()); + NumBasicBlocksMap.emplace(BF.getAddress(), BF.size()); } } @@ -638,8 +615,8 @@ BoltAddressTranslation::getSecondaryEntryPointId(uint64_t Address, if (FunctionIt == SecondaryEntryPointsMap.end()) return 0; const std::vector &Offsets = FunctionIt->second; - auto OffsetIt = std::find(Offsets.begin(), Offsets.end(), Offset); - if (OffsetIt == Offsets.end()) + auto OffsetIt = llvm::lower_bound(FunctionIt->second, Offset); + if (OffsetIt == Offsets.end() || *OffsetIt != Offset) return 0; // Adding one here because main entry point is not stored in BAT, and // enumeration for secondary entry points starts with 1. @@ -675,30 +652,5 @@ BoltAddressTranslation::translateSymbol(const BinaryContext &BC, return std::pair(ParentBF, SecondaryEntryId); } -bool BoltAddressTranslation::isSecondaryEntry(uint64_t OutputAddress, - uint32_t Offset) const { - const uint64_t InputOffset = - translate(OutputAddress, Offset, /*IsBranchSrc*/ false); - - const uint64_t HotAddress = fetchParentAddress(OutputAddress); - auto MapsIter = Maps.find(HotAddress ? HotAddress : OutputAddress); - if (MapsIter == Maps.end()) - return false; - - const uint64_t InputAddress = MapsIter->second.begin()->second; - - auto FunctionIt = SecondaryEntryPointsMap.find(Address); - if (FunctionIt == SecondaryEntryPointsMap.end()) - return false; - const std::vector &Offsets = FunctionIt->second; - uint64_t InputOffset = translate(OutputAddress, Offset, /*IsBranchSrc*/ false); - auto OffsetIt = llvm::lower_bound(Offsets, InputOffset << 1); - return OffsetIt != Offsets.end() && *OffsetIt >> 1 == InputOffset; -} - -bool BoltAddressTranslation::isCallContinuationLandingPad( - uint64_t Address, uint32_t Offset) const { -} - } // namespace bolt } // namespace llvm diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index c471e97565a57..fcd3cd90271a2 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -809,18 +809,11 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, return false; if (!Func.hasCFG()) { - if (!BAT) - return false; - const uint64_t FuncAddress = Func.getAddress(); - const BinaryData *BD = - BC->getBinaryDataAtAddress(FuncAddress + Offset); - unsigned EntryID = 0; - if (BD && BD->getSymbol()) - EntryID = BAT->translateSymbol(*BC, *BD->getSymbol(), 0).second; - const MCSymbol &Symbol = - const auto [Function, EntryID] = BAT->translateSymbol(BC, - return BAT && !(BAT->isSecondaryEntry(FuncAddress, Offset) || - BAT->isCallContinuationLandingPad(FuncAddress, Offset)); + const uint64_t Address = Func.getAddress(); + // Check if offset is a secondary entry point or a call continuation + // landing pad (offset shifted by function size). + return BAT && !BAT->getSecondaryEntryPointId(Address, Offset) && + !BAT->getSecondaryEntryPointId(Address, Func.getSize() + Offset); } // The offset should not be an entry point or a landing pad. From fe9900b5bbb20b8fe3b036d7636c2bf8e25e6c88 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 27 Nov 2024 07:40:26 -0800 Subject: [PATCH 3/5] Preserve CFG until BAT, use it to check call cont landing pads, encode them in secondary entry points table Created using spr 1.3.4 --- bolt/docs/BAT.md | 1 - 1 file changed, 1 deletion(-) diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md index 828114310e195..afbd59220b045 100644 --- a/bolt/docs/BAT.md +++ b/bolt/docs/BAT.md @@ -44,7 +44,6 @@ The general layout is as follows: ``` Hot functions table Cold functions table -Call continuation landing pads table (optional) Functions table: |------------------| From 567d8103a7d9175e9ec5ac366fff73091a6032ca Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 27 Nov 2024 08:17:09 -0800 Subject: [PATCH 4/5] update test Created using spr 1.3.4 --- bolt/test/X86/callcont-fallthru.s | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s index 467600abfb694..f6f2c7d117a08 100644 --- a/bolt/test/X86/callcont-fallthru.s +++ b/bolt/test/X86/callcont-fallthru.s @@ -25,28 +25,27 @@ # RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3 ## Check that a landing pad is emitted in BAT -# RUN: llvm-bat-dump %t.out --dump-all | FileCheck %s --check-prefix=CHECK-BAT +# RUN: llvm-bat-dump %t.out4 --dump-all | FileCheck %s --check-prefix=CHECK-BAT # CHECK-BAT: 1 secondary entry points: -# CHECK-BAT-NEXT: 0x38 (lp) ## Check BAT case of a fallthrough to a call continuation -# link_fdata %s %t.out3 %t.pa.bat PREAGG -# RUN: perf2bolt %t.out3 -p %t.pa.bat --pa -o %t.fdata +# link_fdata %s %t.out4 %t.pa.bat PREAGG +# RUN: perf2bolt %t.out4 -p %t.pa.bat --pa -o %t.fdata # RUN: FileCheck %s --check-prefix=CHECK-BAT-CC --input-file=%t.fdata # CHECK-BAT-CC: main ## Check BAT case of a fallthrough to a secondary entry point or a landing pad -# link_fdata %s %t.out3 %t.pa.bat2 PREAGG3 +# link_fdata %s %t.out4 %t.pa.bat2 PREAGG3 ## Secondary entry -# RUN: perf2bolt %t.out3 -p %t.pa.bat2 --pa -o %t.fdata2 +# RUN: perf2bolt %t.out4 -p %t.pa.bat2 --pa -o %t.fdata2 # RUN: FileCheck %s --check-prefix=CHECK-BAT-ENTRY --input-file=%t.fdata2 # CHECK-BAT-ENTRY: main ## Landing pad -# RUN: llvm-strip --strip-unneeded %t.out3 -# RUN: perf2bolt %t.out3 -p %t.pa.bat2 --pa -o %t.fdata3 +# RUN: llvm-strip --strip-unneeded %t.out4 +# RUN: perf2bolt %t.out4 -p %t.pa.bat2 --pa -o %t.fdata3 # RUN: FileCheck %s --check-prefix=CHECK-BAT-LP --input-file=%t.fdata3 # CHECK-BAT-LP: main From 4310965f2e0fcf28d4a7f33c9e91cbeddeb2cae4 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 27 Nov 2024 14:26:05 -0800 Subject: [PATCH 5/5] Offset CallContLPs by MaxInputOffset + 1 Created using spr 1.3.4 --- bolt/lib/Profile/BoltAddressTranslation.cpp | 10 ++++-- bolt/lib/Profile/DataAggregator.cpp | 7 +++-- bolt/test/X86/callcont-fallthru.s | 35 +++++++++++++-------- 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp index 6c26ed3e26b41..2f8bccf4afb5d 100644 --- a/bolt/lib/Profile/BoltAddressTranslation.cpp +++ b/bolt/lib/Profile/BoltAddressTranslation.cpp @@ -87,6 +87,11 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) { continue; uint32_t NumSecondaryEntryPoints = 0; + // Offset call continuation landing pads by max input offset + 1 to prevent + // confusing them with real entry points. Note we can't use the input size + // as it's not available in BOLTed binary. + const BBHashMapTy &BBHashMap = getBBHashMap(InputAddress); + const uint32_t CallContLPOffset = std::prev(BBHashMap.end())->first + 1; for (const BinaryBasicBlock &BB : llvm::drop_begin(Function)) { if (BB.isEntryPoint()) { ++NumSecondaryEntryPoints; @@ -104,8 +109,8 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) { if (!Instr || !BC.MIB->isCall(*Instr)) continue; ++NumSecondaryEntryPoints; - SecondaryEntryPointsMap[OutputAddress].push_back( - Function.getOutputSize() + BB.getOffset()); + SecondaryEntryPointsMap[OutputAddress].push_back(CallContLPOffset + + BB.getOffset()); } if (NumSecondaryEntryPoints) llvm::sort(SecondaryEntryPointsMap[OutputAddress]); @@ -124,7 +129,6 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) { // Add entries for deleted blocks. They are still required for correct BB // mapping of branches modified by SCTC. By convention, they would have the // end of the function as output address. - const BBHashMapTy &BBHashMap = getBBHashMap(InputAddress); if (BBHashMap.size() != Function.size()) { const uint64_t EndOffset = Function.getOutputSize(); std::unordered_set MappedInputOffsets; diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index fcd3cd90271a2..32c1d82c8c31f 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -810,10 +810,13 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, if (!Func.hasCFG()) { const uint64_t Address = Func.getAddress(); + if (!BAT) + return false; + const uint32_t InputOffset = BAT->translate(Address, Offset, false); // Check if offset is a secondary entry point or a call continuation // landing pad (offset shifted by function size). - return BAT && !BAT->getSecondaryEntryPointId(Address, Offset) && - !BAT->getSecondaryEntryPointId(Address, Func.getSize() + Offset); + return !BAT->getSecondaryEntryPointId(Address, InputOffset) && + !BAT->getSecondaryEntryPointId(Address, Func.getSize() + InputOffset); } // The offset should not be an entry point or a landing pad. diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s index f6f2c7d117a08..d463c7f8eebc8 100644 --- a/bolt/test/X86/callcont-fallthru.s +++ b/bolt/test/X86/callcont-fallthru.s @@ -21,34 +21,40 @@ # RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3 ## Check fallthrough to a landing pad case. -# RUN: llvm-bolt %t.exe --pa -p %t.pa3 -o %t.out4 --enable-bat \ +# RUN: llvm-bolt %t.exe --pa -p %t.pa3 -o %t.out4 \ # RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3 -## Check that a landing pad is emitted in BAT -# RUN: llvm-bat-dump %t.out4 --dump-all | FileCheck %s --check-prefix=CHECK-BAT +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown --defsym GLOBL=1 \ +# RUN: %s -o %t.o +# RUN: %clangxx %cxxflags %t.o -o %t2 -Wl,-q -nostdlib +# RUN: llvm-bolt %t2 --pa -p %t.pa3 -o %t.bat --enable-bat \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3 -# CHECK-BAT: 1 secondary entry points: +## Check that a landing pad is emitted in BAT +# RUN: llvm-bat-dump %t.bat --dump-all | FileCheck %s --check-prefix=CHECK-BAT -## Check BAT case of a fallthrough to a call continuation -# link_fdata %s %t.out4 %t.pa.bat PREAGG -# RUN: perf2bolt %t.out4 -p %t.pa.bat --pa -o %t.fdata -# RUN: FileCheck %s --check-prefix=CHECK-BAT-CC --input-file=%t.fdata -# CHECK-BAT-CC: main +# CHECK-BAT: secondary entry points: ## Check BAT case of a fallthrough to a secondary entry point or a landing pad -# link_fdata %s %t.out4 %t.pa.bat2 PREAGG3 +# RUN: link_fdata %s %t.bat %t.pa.bat2 PREAGG3 ## Secondary entry -# RUN: perf2bolt %t.out4 -p %t.pa.bat2 --pa -o %t.fdata2 +# RUN: perf2bolt %t.bat -p %t.pa.bat2 --pa -o %t.fdata2 # RUN: FileCheck %s --check-prefix=CHECK-BAT-ENTRY --input-file=%t.fdata2 # CHECK-BAT-ENTRY: main ## Landing pad -# RUN: llvm-strip --strip-unneeded %t.out4 -# RUN: perf2bolt %t.out4 -p %t.pa.bat2 --pa -o %t.fdata3 +# RUN: llvm-strip --strip-unneeded %t.bat +# RUN: perf2bolt %t.bat -p %t.pa.bat2 --pa -o %t.fdata3 # RUN: FileCheck %s --check-prefix=CHECK-BAT-LP --input-file=%t.fdata3 # CHECK-BAT-LP: main +## Check BAT case of a fallthrough to a call continuation +# link_fdata %s %t.bat %t.pa.bat PREAGG +# RUN: perf2bolt %t.bat -p %t.pa.bat --pa -o %t.fdata +# RUN: FileCheck %s --check-prefix=CHECK-BAT-CC --input-file=%t.fdata +# CHECK-BAT-CC: main + .globl foo .type foo, %function foo: @@ -107,6 +113,9 @@ Ltmp4: # CHECK3: callq foo # CHECK3-NEXT: count: 0 +.ifdef GLOBL +.globl Ltmp3 +.endif Ltmp3: cmpl $0x0, -0x18(%rbp) Ltmp3_br: