Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 84 additions & 12 deletions llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ STATISTIC(MissingAllocForContextId,
"Number of missing alloc nodes for context ids");
STATISTIC(SkippedCallsCloning,
"Number of calls skipped during cloning due to unexpected operand");
STATISTIC(MismatchedCloneAssignments,
"Number of callsites assigned to call multiple non-matching clones");

static cl::opt<std::string> DotFilePathPrefix(
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
Expand Down Expand Up @@ -2060,6 +2062,20 @@ static bool isMemProfClone(const Function &F) {
return F.getName().contains(MemProfCloneSuffix);
}

// Return the clone number of the given function by extracting it from the
// memprof suffix. Assumes the caller has already confirmed it is a memprof
// clone.
static unsigned getMemProfCloneNum(const Function &F) {
assert(isMemProfClone(F));
auto Pos = F.getName().find_last_of('.');
assert(Pos > 0);
unsigned CloneNo;
bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
assert(!Err);
(void)Err;
return CloneNo;
}

std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
const Instruction *Call,
unsigned CloneNo) const {
Expand Down Expand Up @@ -3979,7 +3995,22 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {

void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
FuncInfo CalleeFunc) {
if (CalleeFunc.cloneNo() > 0)
auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction();
auto NewCalleeCloneNo = CalleeFunc.cloneNo();
if (isMemProfClone(*CurF)) {
// If we already assigned this callsite to call a specific non-default
// clone (i.e. not the original function which is clone 0), ensure that we
// aren't trying to now update it to call a different clone, which is
// indicative of a bug in the graph or function assignment.
auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
if (CurCalleeCloneNo != NewCalleeCloneNo) {
LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
<< CurCalleeCloneNo << " now " << NewCalleeCloneNo
<< "\n");
MismatchedCloneAssignments++;
}
}
if (NewCalleeCloneNo > 0)
cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
OREGetter(CallerCall.call()->getFunction())
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
Expand All @@ -3995,7 +4026,19 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
assert(CI &&
"Caller cannot be an allocation which should not have profiled calls");
assert(CI->Clones.size() > CallerCall.cloneNo());
CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
auto NewCalleeCloneNo = CalleeFunc.cloneNo();
auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
// If we already assigned this callsite to call a specific non-default
// clone (i.e. not the original function which is clone 0), ensure that we
// aren't trying to now update it to call a different clone, which is
// indicative of a bug in the graph or function assignment.
if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
<< CurCalleeCloneNo << " now " << NewCalleeCloneNo
<< "\n");
MismatchedCloneAssignments++;
}
CurCalleeCloneNo = NewCalleeCloneNo;
}

// Update the debug information attached to NewFunc to use the clone Name. Note
Expand Down Expand Up @@ -4703,6 +4746,19 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// where the callers were assigned to different clones of a function.
}

auto FindFirstAvailFuncClone = [&]() {
// Find first function in FuncClonesToCallMap without an assigned
// clone of this callsite Node. We should always have one
// available at this point due to the earlier cloning when the
// FuncClonesToCallMap size was smaller than the clone number.
for (auto &CF : FuncClonesToCallMap) {
if (!FuncCloneToCurNodeCloneMap.count(CF.first))
return CF.first;
}
assert(false &&
"Expected an available func clone for this callsite clone");
};

// See if we can use existing function clone. Walk through
// all caller edges to see if any have already been assigned to
// a clone of this callsite's function. If we can use it, do so. If not,
Expand Down Expand Up @@ -4819,16 +4875,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// clone of OrigFunc for another caller during this iteration over
// its caller edges.
if (!FuncCloneAssignedToCurCallsiteClone) {
// Find first function in FuncClonesToCallMap without an assigned
// clone of this callsite Node. We should always have one
// available at this point due to the earlier cloning when the
// FuncClonesToCallMap size was smaller than the clone number.
for (auto &CF : FuncClonesToCallMap) {
if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
FuncCloneAssignedToCurCallsiteClone = CF.first;
break;
}
}
FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
assert(FuncCloneAssignedToCurCallsiteClone);
// Assign Clone to FuncCloneAssignedToCurCallsiteClone
AssignCallsiteCloneToFuncClone(
Expand All @@ -4842,6 +4889,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
FuncCloneAssignedToCurCallsiteClone);
}
}
// If we didn't assign a function clone to this callsite clone yet, e.g.
// none of its callers has a non-null call, do the assignment here.
// We want to ensure that every callsite clone is assigned to some
// function clone, so that the call updates below work as expected.
// In particular if this is the original callsite, we want to ensure it
// is assigned to the original function, otherwise the original function
// will appear available for assignment to other callsite clones,
// leading to unintended effects. For one, the unknown and not updated
// callers will call into cloned paths leading to the wrong hints,
// because they still call the original function (clone 0). Also,
// because all callsites start out as being clone 0 by default, we can't
// easily distinguish between callsites explicitly assigned to clone 0
// vs those never assigned, which can lead to multiple updates of the
Comment on lines +4903 to +4904

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe if we used a struct instead of an integer to indicate the clone number as value in the map, we could hold additional data about whether it was assigned along with the assigned clone number. Would that simplify any of the logic here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not completely sure which map you are referring to, but I started out making a change to add a bool in the ContextNode as to whether its callsite had been assigned to a func clone, but that turns out to be tricky to get right, and this was more targeted and simpler to get right as a fix for the specific problem I was looking at. In particular, we want callsites that we weren't able to update callers for to get assigned to clone 0, which this change ensures. I've added a TODO though to look at adding a mechanism to better identify and distinguish callsite clones that aren't getting assigned to any clone.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't referring to any particular map, rather some way of noting that the callsite has been assigned. Your response about adding it in ContextNode was helpful. Adding a TODO for now sounds good.

// calls when invoking updateCall below, with mismatched clone values.
// TODO: Add a flag to the callsite nodes or some other mechanism to
// better distinguish and identify callsite clones that are not getting
// assigned to function clones as expected.
if (!FuncCloneAssignedToCurCallsiteClone) {
FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
assert(FuncCloneAssignedToCurCallsiteClone &&
"No available func clone for this callsite clone");
AssignCallsiteCloneToFuncClone(
FuncCloneAssignedToCurCallsiteClone, Call, Clone,
/*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
}
}
if (VerifyCCG) {
checkNode<DerivedCCG, FuncTy, CallTy>(Node);
Expand Down
145 changes: 145 additions & 0 deletions llvm/test/ThinLTO/X86/memprof_func_assign_fix.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
;; Make sure we assign the original callsite to a function clone (which will be
;; the original function clone), even when we cannot update its caller (due to
;; missing metadata e.g. from mismatched profiles). Otherwise we will try to use
;; the original function for a different clone, leading to confusion later when
;; rewriting the calls.

;; -stats requires asserts
; REQUIRES: asserts

; RUN: opt -thinlto-bc %s >%t.o
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
; RUN: -supports-hot-cold-new \
; RUN: -r=%t.o,A,plx \
; RUN: -r=%t.o,B,plx \
; RUN: -r=%t.o,C,plx \
; RUN: -r=%t.o,D,plx \
; RUN: -r=%t.o,E,plx \
; RUN: -r=%t.o,F,plx \
; RUN: -r=%t.o,G,plx \
; RUN: -r=%t.o,A1,plx \
; RUN: -r=%t.o,B1,plx \
; RUN: -r=%t.o,_Znwm, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -debug-only=memprof-context-disambiguation \
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
; RUN: -o %t.out 2>&1 | FileCheck %s \
; RUN: --implicit-check-not="Mismatch in call clone assignment" \
; RUN: --implicit-check-not="Number of callsites assigned to call multiple non-matching clones"

; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR

; ModuleID = '<stdin>'
source_filename = "reduced.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

;; A calls C with no !callsite metadata (simulating a mismatched/missing
;; profile), so this caller cannot be updated during cloning and must keep
;; calling the original (clone 0) C.
; IR-LABEL: define dso_local void @A()
define void @A() #0 {
; IR: call void @C()
call void @C()
ret void
}

;; B's call carries !callsite !1 (context 234), which reaches the cold
;; allocation contexts, so it is rewritten to call the clone C.memprof.1.
; IR-LABEL: define dso_local void @B()
define void @B() #0 {
; IR: call void @C.memprof.1()
call void @C(), !callsite !1
ret void
}

;; Original (clone 0) of C: must keep calling the original F and D. A cold
;; clone C.memprof.1 calling the cloned callees is checked further below.
; IR-LABEL: define dso_local void @C()
define void @C() #0 {
; IR: call void @F()
call void @F(), !callsite !16
; IR: call void @D()
call void @D(), !callsite !2
ret void
}

;; Original (clone 0) of D: calls the original E and G (notcold paths).
; IR-LABEL: define dso_local void @D()
define void @D() #0 {
; IR: call void @E()
call void @E(), !callsite !3
; IR: call void @G()
call void @G(), !callsite !17
ret void
}

;; Allocation in E carries both a "notcold" (!5, ending at A's context 123)
;; and a "cold" (!7, ending at B's context 234) memprof context; the original
;; clone keeps the notcold attribute.
; IR-LABEL: define dso_local void @E()
define void @E() #0 {
; IR: call ptr @_Znwm(i64 0) #[[NOTCOLD:[0-9]+]]
%1 = call ptr @_Znwm(i64 0), !memprof !4, !callsite !9
ret void
}

;; Original (clone 0) of F: keeps calling the original G.
; IR-LABEL: define dso_local void @F()
define void @F() #0 {
; IR: call void @G()
call void @G(), !callsite !17
ret void
}

;; Allocation in G carries both a "notcold" (!11, ending at context 912) and
;; a "cold" (!13, ending at context 812) memprof context; the original clone
;; keeps the notcold attribute.
; IR-LABEL: define dso_local void @G()
define void @G() #0 {
; IR: call ptr @_Znwm(i64 0) #[[NOTCOLD]]
%2 = call ptr @_Znwm(i64 0), !memprof !10, !callsite !15
ret void
}

;; A1's call carries !callsite !18 (context 912), a notcold path, so it keeps
;; calling the original C.
; IR-LABEL: define dso_local void @A1()
define void @A1() #0 {
; IR: call void @C()
call void @C(), !callsite !18
ret void
}

;; B1's call carries !callsite !19 (context 812), a cold path, so it is
;; rewritten to call the clone C.memprof.1.
; IR-LABEL: define dso_local void @B1()
define void @B1() #0 {
; IR: call void @C.memprof.1()
call void @C(), !callsite !19
ret void
}

; IR-LABEL: define dso_local void @C.memprof.1()
; IR: call void @F.memprof.1()
; IR: call void @D.memprof.1()

; IR-LABEL: define dso_local void @D.memprof.1()
; IR: call void @E.memprof.1()
; IR: call void @G()

; IR-LABEL: define dso_local void @E.memprof.1()
; IR: call ptr @_Znwm(i64 0) #[[COLD:[0-9]+]]

; IR-LABEL: define dso_local void @F.memprof.1()
; IR: call void @G.memprof.1()

; IR-LABEL: define dso_local void @G.memprof.1()
; IR: call ptr @_Znwm(i64 0) #[[COLD]]

declare ptr @_Znwm(i64)

attributes #0 = { noinline optnone }
; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
; IR: attributes #[[COLD]] = { "memprof"="cold" }

!0 = !{i64 123}
!1 = !{i64 234}
!2 = !{i64 345}
!3 = !{i64 456}
!4 = !{!5, !7}
!5 = !{!6, !"notcold"}
!6 = !{i64 567, i64 456, i64 345, i64 123}
!7 = !{!8, !"cold"}
!8 = !{i64 567, i64 456, i64 345, i64 234}
!9 = !{i64 567}
!10 = !{!11, !13}
!11 = !{!12, !"notcold"}
!12 = !{i64 678, i64 891, i64 789, i64 912}
!13 = !{!14, !"cold"}
!14 = !{i64 678, i64 891, i64 789, i64 812}
!15 = !{i64 678}
!16 = !{i64 789}
!17 = !{i64 891}
!18 = !{i64 912}
!19 = !{i64 812}
Loading