diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index a6ec6c1207767..0edaccc844df4 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -127,15 +127,19 @@ static uint64_t computeStackId(const memprof::Frame &Frame) { return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column); } +/* Convenience overload: determines the AllocationType for the allocation described by AllocInfo by forwarding its Info record's total lifetime access density, allocation count and total lifetime to the three-argument getAllocType. */ static AllocationType getAllocType(const AllocationInfo *AllocInfo) { + return getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(), + AllocInfo->Info.getAllocCount(), + AllocInfo->Info.getTotalLifetime()); +} + static AllocationType addCallStack(CallStackTrie &AllocTrie, const AllocationInfo *AllocInfo, uint64_t FullStackId) { SmallVector StackIds; for (const auto &StackFrame : AllocInfo->CallStack) StackIds.push_back(computeStackId(StackFrame)); - auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(), - AllocInfo->Info.getAllocCount(), - AllocInfo->Info.getTotalLifetime()); + auto AllocType = getAllocType(AllocInfo); std::vector ContextSizeInfo; if (recordContextSizeInfoForAnalysis()) { auto TotalSize = AllocInfo->Info.getTotalSize(); @@ -406,22 +410,39 @@ handleAllocSite(Instruction &I, CallBase *CI, const std::set &AllocInfoSet, std::map, AllocMatchInfo> &FullStackIdToAllocMatchInfo) { + // TODO: Remove this once the profile creation logic deduplicates contexts + // that are the same other than the IsInlineFrame bool. Until then, keep the + // largest. + DenseMap UniqueFullContextIdAllocInfo; + for (auto *AllocInfo : AllocInfoSet) { + auto FullStackId = computeFullStackId(AllocInfo->CallStack); + auto [It, Inserted] = + UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo}); + // If inserted entry, done. + if (Inserted) + continue; + // Keep the larger one, or the noncold one if they are the same size. 
+ auto CurSize = It->second->Info.getTotalSize(); + auto NewSize = AllocInfo->Info.getTotalSize(); + if ((CurSize > NewSize) || + (CurSize == NewSize && + getAllocType(AllocInfo) != AllocationType::NotCold)) + continue; + It->second = AllocInfo; + } // We may match this instruction's location list to multiple MIB // contexts. Add them to a Trie specialized for trimming the contexts to // the minimal needed to disambiguate contexts with unique behavior. CallStackTrie AllocTrie(&ORE, MaxColdSize); uint64_t TotalSize = 0; uint64_t TotalColdSize = 0; - for (auto *AllocInfo : AllocInfoSet) { + for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) { // Check the full inlined call stack against this one. // If we found and thus matched all frames on the call, include // this MIB. if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack, InlinedCallStack)) { NumOfMemProfMatchedAllocContexts++; - uint64_t FullStackId = 0; - if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis()) - FullStackId = computeFullStackId(AllocInfo->CallStack); auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); TotalSize += AllocInfo->Info.getTotalSize(); if (AllocType == AllocationType::Cold) diff --git a/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll b/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll new file mode 100644 index 0000000000000..5213a07d13d39 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll @@ -0,0 +1,118 @@ +;; Tests that the compiler ignores smaller contexts that differ only in the +;; IsInlineFrame bool. These map to the same full context id internally, as we +;; ignore the inline frame status which may differ in feedback compiles. +;; Presumably this happens when profiles collected from different binaries are +;; merged. If we didn't pick the largest we would default them all to noncold. 
+ +;; Avoid failures on big-endian systems that can't read the profile properly +; REQUIRES: x86_64-linux + +;; Generate the profile and the IR. +; RUN: split-file %s %t + +;; Generate indexed profile +; RUN: llvm-profdata merge %t/memprof_diff_inline.yaml -o %t.memprofdata + +; RUN: opt < %t/memprof_diff_inline.ll -passes='memprof-use' -S -memprof-report-hinted-sizes -memprof-print-match-info 2>&1 | FileCheck %s --check-prefixes=MEMPROF + +; MEMPROF: MemProf notcold context with id 10194276560488437434 has total profiled size 200 is matched with 1 frames +; MEMPROF: MemProf cold context with id 16342802530253093571 has total profiled size 10000 is matched with 1 frames + +;--- memprof_diff_inline.yaml +--- +HeapProfileRecords: + - GUID: _Z3foov + AllocSites: + # Small non-cold, full context id 16342802530253093571, should ignore + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 10 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + # Large cold, full context id 16342802530253093571, should keep + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: true } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 10000 + TotalLifetime: 200000 + TotalLifetimeAccessDensity: 0 + # Small non-cold, full context id 16342802530253093571, should ignore + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: 
_Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: true } + - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 100 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + # Small non-cold, full context id 10194276560488437434 + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: main, LineOffset: 9, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 200 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + CallSites: [] +... +;--- memprof_diff_inline.ll +; ModuleID = 'memprof_diff_inline.cc' +source_filename = "memprof_diff_inline.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%"struct.std::nothrow_t" = type { i8 } + +@_ZSt7nothrow = external global %"struct.std::nothrow_t", align 1 + +define dso_local noundef ptr @_Z3foov() !dbg !10 { +entry: + ; MEMPROF: call {{.*}} @_Znwm{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]] + %call = call noalias noundef align 32 ptr @_Znwm(i64 noundef 32) #6, !dbg !13 + ret ptr %call +} + +declare noundef ptr @_Znwm(i64 noundef) + +attributes #6 = { builtin allocsize(0) } + +; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]]} + +; MEMPROF: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"notcold", ![[CONTEXTSIZE1:[0-9]+]]} +; MEMPROF: ![[STACK1]] = !{i64 2732490490862098848, i64 8467819354083268568, i64 9086428284934609951, i64 2061451396820446691} +;; Full context id 10194276560488437434 == -8252467513221114182 +; MEMPROF: ![[CONTEXTSIZE1]] = !{i64 -8252467513221114182, i64 200} + +; MEMPROF: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"cold", ![[CONTEXTSIZE2:[0-9]+]]} +; MEMPROF: ![[STACK2]] = !{i64 2732490490862098848, i64 8467819354083268568, 
i64 9086428284934609951, i64 -5747251260480066785} +;; Full context id 16342802530253093571 == -2103941543456458045 +;; We should have kept the large (cold) one. +; MEMPROF: ![[CONTEXTSIZE2]] = !{i64 -2103941543456458045, i64 10000} + +; MEMPROF: ![[C1]] = !{i64 2732490490862098848} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 6cbe6284d1f0a088b5c6482ae27b738f03d82fe7)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "memprof.cc", directory: "/usr/local/google/home/tejohnson/llvm/tmp", checksumkind: CSK_MD5, checksum: "e8c40ebe4b21776b4d60e9632cbc13c2") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12) +!11 = !DISubroutineType(types: !12) +!12 = !{} +!13 = !DILocation(line: 5, column: 10, scope: !10) diff --git a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll index 2461ca32e9821..ba53c5797208c 100644 --- a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll +++ b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll @@ -4,24 +4,50 @@ ;; Avoid failures on big-endian systems that can't read the profile properly ; REQUIRES: x86_64-linux -;; TODO: Use text profile inputs once that is available for memprof. -;; # To update the Inputs below, run Inputs/update_memprof_inputs.sh. -;; # To generate below LLVM IR for use in matching. -;; $ clang++ -gmlt -fdebug-info-for-profiling -S %S/Inputs/memprof_loop_unroll_b.cc -emit-llvm +; Generate the profile and the IR. 
+; RUN: split-file %s %t + +;; Generate indexed profile +; RUN: llvm-profdata merge %t/memprof_loop_unroll.yaml -o %t.memprofdata -; RUN: llvm-profdata merge %S/Inputs/memprof_loop_unroll.memprofraw --profiled-binary %S/Inputs/memprof_loop_unroll.exe -o %t.memprofdata ;; Set the minimum lifetime threshold to 0 to ensure that one context is ;; considered cold (the other will be notcold). -; RUN: opt < %s -passes='memprof-use' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s +; RUN: opt < %t/memprof_loop_unroll.ll -passes='memprof-use' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s -;; Conservatively annotate as not cold. We get two messages as there are two -;; unrolled copies of the allocation. -; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and indistinguishable alloc type notcold: 4 -; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and indistinguishable alloc type notcold: 4 +;; Conservatively annotate as not cold. 
+; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and single alloc type notcold: 4 ; CHECK: call {{.*}} @_Znam{{.*}} #[[ATTR:[0-9]+]] ; CHECK: attributes #[[ATTR]] = { builtin allocsize(0) "memprof"="notcold" } ; CHECK-NOT: stackIds: () +;--- memprof_loop_unroll.yaml +--- +HeapProfileRecords: + - GUID: 0x7f8d88fcc70a347b + AllocSites: + - Callstack: + - { Function: 0x7f8d88fcc70a347b, LineOffset: 2, Column: 16, IsInlineFrame: false } + - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 4 + TotalLifetime: 2 + TotalLifetimeAccessDensity: 12500000000 + - Callstack: + - { Function: 0x7f8d88fcc70a347b, LineOffset: 2, Column: 16, IsInlineFrame: false } + - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 4 + TotalLifetime: 2 + TotalLifetimeAccessDensity: 0 + - GUID: 0xdb956436e78dd5fa + CallSites: + - Frames: + - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false } +... + +;--- memprof_loop_unroll.ll ; ModuleID = 'memprof_loop_unroll_b.cc' source_filename = "memprof_loop_unroll_b.cc" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"