diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 576a31f8b86ae..27049d547f6e3 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1377,9 +1377,12 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   // Compute the last node's context ids once, as it is shared by all calls in
   // this entry.
   DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
-  assert(!LastNodeContextIds.empty());
-  for (unsigned I = 0; I < Calls.size(); I++) {
+  bool PrevIterCreatedNode = false;
+  bool CreatedNode = false;
+  for (unsigned I = 0; I < Calls.size();
+       I++, PrevIterCreatedNode = CreatedNode) {
+    CreatedNode = false;
     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
     // Skip any for which we didn't assign any ids, these don't get a node in
     // the graph.
@@ -1391,7 +1394,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
       if (!CallToMatchingCall.contains(Call))
         continue;
       auto MatchingCall = CallToMatchingCall[Call];
-      assert(NonAllocationCallToContextNodeMap.contains(MatchingCall));
+      if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
+        // This should only happen if we had a prior iteration, and it didn't
+        // create a node because of the below recomputation of context ids
+        // finding none remaining and continuing early.
+        assert(I > 0 && !PrevIterCreatedNode);
+        continue;
+      }
       NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
           Call);
       continue;
@@ -1444,6 +1453,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     ContextNode *NewNode = NodeOwner.back().get();
     NodeToCallingFunc[NewNode] = Func;
     NonAllocationCallToContextNodeMap[Call] = NewNode;
+    CreatedNode = true;
     NewNode->AllocTypes = computeAllocType(SavedContextIds);
 
     ContextNode *FirstNode = getNodeForStackId(Ids[0]);
@@ -1548,13 +1558,23 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // of length, and within each length, lexicographically by stack id. The
   // latter is so that we can specially handle calls that have identical stack
   // id sequences (either due to cloning or artificially because of the MIB
-  // context pruning).
-  std::stable_sort(Calls.begin(), Calls.end(),
-                   [](const CallContextInfo &A, const CallContextInfo &B) {
-                     return A.StackIds.size() > B.StackIds.size() ||
-                            (A.StackIds.size() == B.StackIds.size() &&
-                             A.StackIds < B.StackIds);
-                   });
+  // context pruning). Those with the same Ids are then sorted by function to
+  // facilitate efficiently mapping them to the same context node.
+  // Because the functions are pointers, to ensure a stable sort first assign
+  // each function pointer to its first index in the Calls array, and then use
+  // that to sort by.
+  DenseMap<const FuncTy *, unsigned> FuncToIndex;
+  for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
+    FuncToIndex.insert({CallCtxInfo.Func, Idx});
+  std::stable_sort(
+      Calls.begin(), Calls.end(),
+      [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
+        return A.StackIds.size() > B.StackIds.size() ||
+               (A.StackIds.size() == B.StackIds.size() &&
+                (A.StackIds < B.StackIds ||
+                 (A.StackIds == B.StackIds &&
+                  FuncToIndex[A.Func] < FuncToIndex[B.Func])));
+      });
 
   // Find the node for the last stack id, which should be the same
   // across all calls recorded for this id, and is the id for this
@@ -1572,18 +1592,26 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
     DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
     assert(!LastNodeContextIds.empty());
 
-    // Map from function to the first call from the below list (with matching
-    // stack ids) found in that function. Note that calls from different
-    // functions can have the same stack ids because this is the list of stack
-    // ids that had (possibly pruned) nodes after building the graph from the
-    // allocation MIBs.
-    DenseMap<const FuncTy *, CallInfo> FuncToCallMap;
+#ifndef NDEBUG
+    // Save the set of functions seen for a particular set of the same stack
+    // ids. This is used to ensure that they have been correctly sorted to be
+    // adjacent in the Calls list, since we rely on that to efficiently place
+    // all such matching calls onto the same context node.
+    DenseSet<const FuncTy *> MatchingIdsFuncSet;
+#endif
 
     for (unsigned I = 0; I < Calls.size(); I++) {
       auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
       assert(SavedContextIds.empty());
       assert(LastId == Ids.back());
 
+#ifndef NDEBUG
+      // If this call has a different set of ids than the last one, clear the
+      // set used to ensure they are sorted properly.
+      if (I > 0 && Ids != Calls[I - 1].StackIds)
+        MatchingIdsFuncSet.clear();
+#endif
+
       // First compute the context ids for this stack id sequence (the
       // intersection of the context ids of the corresponding nodes).
       // Start with the remaining saved ids for the last node.
@@ -1652,23 +1680,38 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
         continue;
       }
 
-      // If the prior call had the same stack ids this map would not be empty.
+#ifndef NDEBUG
+      // If the prior call had the same stack ids this set would not be empty.
       // Check if we already have a call that "matches" because it is located
-      // in the same function.
-      if (FuncToCallMap.contains(Func)) {
-        // Record the matching call found for this call, and skip it. We
-        // will subsequently combine it into the same node.
-        CallToMatchingCall[Call] = FuncToCallMap[Func];
-        continue;
-      }
+      // in the same function. If the Calls list was sorted properly we should
+      // not encounter this situation as all such entries should be adjacent
+      // and processed in bulk further below.
+      assert(!MatchingIdsFuncSet.contains(Func));
+
+      MatchingIdsFuncSet.insert(Func);
+#endif
 
       // Check if the next set of stack ids is the same (since the Calls vector
      // of tuples is sorted by the stack ids we can just look at the next one).
+      // If so, save them in the CallToMatchingCall map so that they get
+      // assigned to the same context node, and skip them.
       bool DuplicateContextIds = false;
-      if (I + 1 < Calls.size()) {
-        auto &CallCtxInfo = Calls[I + 1];
+      for (unsigned J = I + 1; J < Calls.size(); J++) {
+        auto &CallCtxInfo = Calls[J];
         auto &NextIds = CallCtxInfo.StackIds;
-        DuplicateContextIds = Ids == NextIds;
+        if (NextIds != Ids)
+          break;
+        auto *NextFunc = CallCtxInfo.Func;
+        if (NextFunc != Func) {
+          // We have another Call with the same ids but that cannot share this
+          // node, must duplicate ids for it.
+          DuplicateContextIds = true;
+          break;
+        }
+        auto &NextCall = CallCtxInfo.Call;
+        CallToMatchingCall[NextCall] = Call;
+        // Update I so that it gets incremented correctly to skip this call.
+        I = J;
       }
 
       // If we don't have duplicate context ids, then we can assign all the
@@ -1692,14 +1735,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
         set_subtract(LastNodeContextIds, StackSequenceContextIds);
         if (LastNodeContextIds.empty())
           break;
-        // No longer possibly in a sequence of calls with duplicate stack ids,
-        // clear the map.
-        FuncToCallMap.clear();
-      } else
-        // Record the call with its function, so we can locate it the next time
-        // we find a call from this function when processing the calls with the
-        // same stack ids.
-        FuncToCallMap[Func] = Call;
+      }
     }
   }
 }
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined4.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined4.ll
new file mode 100644
index 0000000000000..bf419ea987bd0
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined4.ll
@@ -0,0 +1,102 @@
+;; This test ensures that the logic which assigns calls to stack nodes
+;; correctly handles a case where multiple nodes have stack ids that
+;; overlap with each other but have different last nodes (can happen with
+;; inlining into various levels of a call chain). Specifically, when we
+;; have one that is duplicated (e.g. unrolling), we need to correctly
+;; handle the case where the context id has already been assigned to
+;; a different call with a different last node.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS --check-prefix=REMARKS
+
+; REMARKS: created clone _Z1Ab.memprof.1
+; REMARKS: created clone _Z3XZNv.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3XZNv.memprof.1
+;; Make sure the inlined context in _Z3XZNv, which partially overlaps the stack
+;; ids in the shorter inlined context of Z2XZv, correctly calls a cloned
+;; version of Z1Ab, which will call the cold annotated allocation.
+; REMARKS: call in clone _Z3XZNv.memprof.1 assigned to call function clone _Z1Ab.memprof.1
+; REMARKS: call in clone _Z1Ab.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone main assigned to call function clone _Z3XZNv
+; REMARKS: call in clone _Z3XZNv assigned to call function clone _Z1Ab
+; REMARKS: call in clone _Z1Ab marked with memprof allocation attribute notcold
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @_Z1Ab() {
+entry:
+  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #1, !memprof !0, !callsite !5
+  ret void
+}
+
+; Function Attrs: nobuiltin
+declare ptr @_Znam(i64) #0
+
+;; Inlining of stack id 2 into 3. Assume this is called from somewhere else.
+define dso_local void @_Z2XZv() local_unnamed_addr #0 {
+entry:
+  ;; Simulate duplication of the callsite (e.g. unrolling).
+  call void @_Z1Ab(), !callsite !6
+  call void @_Z1Ab(), !callsite !6
+  ret void
+}
+
+;; Inlining of stack id 2 into 3 into 4. Called by main below.
+define dso_local void @_Z3XZNv() local_unnamed_addr {
+entry:
+  call void @_Z1Ab(), !callsite !7
+  ret void
+}
+
+define dso_local noundef i32 @main() local_unnamed_addr {
+entry:
+  call void @_Z3XZNv(), !callsite !8 ;; Not cold context
+  call void @_Z3XZNv(), !callsite !9 ;; Cold context
+  ret i32 0
+}
+
+attributes #0 = { nobuiltin }
+attributes #1 = { builtin }
+
+!0 = !{!1, !3}
+;; Not cold context via first call to _Z3XZNv in main
+!1 = !{!2, !"notcold"}
+!2 = !{i64 1, i64 2, i64 3, i64 4, i64 5}
+;; Cold context via second call to _Z3XZNv in main
+!3 = !{!4, !"cold"}
+!4 = !{i64 1, i64 2, i64 3, i64 4, i64 6}
+!5 = !{i64 1}
+!6 = !{i64 2, i64 3}
+!7 = !{i64 2, i64 3, i64 4}
+!8 = !{i64 5}
+!9 = !{i64 6}
+
+; IR: define {{.*}} @_Z1Ab()
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define {{.*}} @_Z2XZv()
+; IR:   call {{.*}} @_Z1Ab()
+; IR:   call {{.*}} @_Z1Ab()
+; IR: define {{.*}} @_Z3XZNv()
+; IR:   call {{.*}} @_Z1Ab()
+; IR: define {{.*}} @main()
+; IR:   call {{.*}} @_Z3XZNv()
+; IR:   call {{.*}} @_Z3XZNv.memprof.1()
+; IR: define {{.*}} @_Z1Ab.memprof.1()
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: define {{.*}} @_Z3XZNv.memprof.1()
+; IR:   call {{.*}} @_Z1Ab.memprof.1()
+
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
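
The comparator added in updateStackNodes breaks ties between calls with identical stack id sequences using each function's first index in the Calls array rather than the function pointer value, so the order is deterministic from run to run; trailing calls that share both the stack ids and the function are then recorded in CallToMatchingCall in a single pass over the now-adjacent entries. The standalone sketch below illustrates the same sort-then-group idea outside the pass; the CallInfo struct, the string-keyed map, and the sample data are simplified stand-ins for this illustration only, not the pass's actual CallContextInfo, FuncTy, or DenseMap types.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Simplified stand-in for one entry of the Calls vector.
struct CallInfo {
  std::string Func;               // stand-in for the function of the call
  std::vector<uint64_t> StackIds; // pruned stack id sequence for the call
};

int main() {
  std::vector<CallInfo> Calls = {
      {"foo", {2, 3}}, {"bar", {2, 3, 4}}, {"foo", {2, 3}}, {"baz", {2, 3}}};

  // Rank each function by its first index in Calls (analogous to FuncToIndex),
  // so the tie-break does not depend on pointer values or hashing order.
  std::map<std::string, size_t> FuncToIndex;
  for (size_t I = 0; I < Calls.size(); ++I)
    FuncToIndex.try_emplace(Calls[I].Func, I);

  // Sort by decreasing length, then lexicographically by stack ids, then by
  // the function's first-occurrence rank, mirroring the patched comparator.
  std::stable_sort(Calls.begin(), Calls.end(),
                   [&](const CallInfo &A, const CallInfo &B) {
                     if (A.StackIds.size() != B.StackIds.size())
                       return A.StackIds.size() > B.StackIds.size();
                     if (A.StackIds != B.StackIds)
                       return A.StackIds < B.StackIds;
                     return FuncToIndex[A.Func] < FuncToIndex[B.Func];
                   });

  // After sorting, calls with identical stack ids and the same function are
  // adjacent, so trailing duplicates can be mapped to the first ("matching")
  // call in one pass, analogous to populating CallToMatchingCall.
  for (size_t I = 0; I < Calls.size(); ++I) {
    size_t J = I + 1;
    while (J < Calls.size() && Calls[J].StackIds == Calls[I].StackIds &&
           Calls[J].Func == Calls[I].Func) {
      std::cout << "call " << J << " matches call " << I << " ("
                << Calls[I].Func << ")\n";
      ++J;
    }
    I = J - 1; // Skip the duplicates just grouped.
  }
  return 0;
}

Keying the tie-break on the first index rather than on the pointer keeps the resulting order reproducible across runs, which is what allows the debug-only MatchingIdsFuncSet assertion above to rely on all same-function, same-ids entries being adjacent.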