[BOLT] Explicitly check for returns when extending call continuation profile (#143295)
Call continuation logic relies on assumptions about fall-through origin:
- the branch is external to the function,
- fall-through start is at the beginning of the block,
- the block is not an entry point or a landing pad.

Leverage trace information to explicitly check whether the origin is a return instruction, and defer to the checks above only in case of a DSO-external branch source.

This covers both regular and BAT cases, addressing call continuation fall-through undercounting in the latter mode, which improves BAT profile quality metrics. For example, for one large binary:
- CFG discontinuity 21.83% -> 0.00%,
- CFG flow imbalance 10.77%/100.00% -> 3.40%/13.82% (weighted/worst),
- CG flow imbalance 8.49% -> 8.49%.

Depends on #143289.

Test Plan: updated callcont-fallthru.s
This commit is contained in:
parent
816ab1af0d
commit
9fed480f18
@ -132,6 +132,9 @@ private:
|
|||||||
/// and use them later for processing and assigning profile.
|
/// and use them later for processing and assigning profile.
|
||||||
std::unordered_map<Trace, TakenBranchInfo, TraceHash> TraceMap;
|
std::unordered_map<Trace, TakenBranchInfo, TraceHash> TraceMap;
|
||||||
std::vector<std::pair<Trace, TakenBranchInfo>> Traces;
|
std::vector<std::pair<Trace, TakenBranchInfo>> Traces;
|
||||||
|
/// Pre-populated addresses of returns, coming from pre-aggregated data or
|
||||||
|
/// disassembly. Used to disambiguate call-continuation fall-throughs.
|
||||||
|
std::unordered_set<uint64_t> Returns;
|
||||||
std::unordered_map<uint64_t, uint64_t> BasicSamples;
|
std::unordered_map<uint64_t, uint64_t> BasicSamples;
|
||||||
std::vector<PerfMemSample> MemSamples;
|
std::vector<PerfMemSample> MemSamples;
|
||||||
|
|
||||||
@ -204,8 +207,8 @@ private:
|
|||||||
/// Return a vector of offsets corresponding to a trace in a function
|
/// Return a vector of offsets corresponding to a trace in a function
|
||||||
/// if the trace is valid, std::nullopt otherwise.
|
/// if the trace is valid, std::nullopt otherwise.
|
||||||
std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
|
std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
|
||||||
getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
|
getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, uint64_t Count,
|
||||||
uint64_t Count) const;
|
bool IsReturn) const;
|
||||||
|
|
||||||
/// Record external entry into the function \p BF.
|
/// Record external entry into the function \p BF.
|
||||||
///
|
///
|
||||||
@ -265,11 +268,14 @@ private:
|
|||||||
uint64_t From, uint64_t To, uint64_t Count,
|
uint64_t From, uint64_t To, uint64_t Count,
|
||||||
uint64_t Mispreds);
|
uint64_t Mispreds);
|
||||||
|
|
||||||
|
/// Checks if \p Addr corresponds to a return instruction.
|
||||||
|
bool checkReturn(uint64_t Addr);
|
||||||
|
|
||||||
/// Register a \p Branch.
|
/// Register a \p Branch.
|
||||||
bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
|
bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
|
||||||
|
|
||||||
/// Register a trace between two LBR entries supplied in execution order.
|
/// Register a trace between two LBR entries supplied in execution order.
|
||||||
bool doTrace(const Trace &Trace, uint64_t Count);
|
bool doTrace(const Trace &Trace, uint64_t Count, bool IsReturn);
|
||||||
|
|
||||||
/// Parser helpers
|
/// Parser helpers
|
||||||
/// Return false if we exhausted our parser buffer and finished parsing
|
/// Return false if we exhausted our parser buffer and finished parsing
|
||||||
|
@ -730,50 +730,54 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool DataAggregator::checkReturn(uint64_t Addr) {
|
||||||
|
auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
|
||||||
|
if (llvm::is_contained(Returns, Addr))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
|
||||||
|
if (!Func)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const uint64_t Offset = Addr - Func->getAddress();
|
||||||
|
if (Func->hasInstructions()
|
||||||
|
? isReturn(Func->getInstructionAtOffset(Offset))
|
||||||
|
: isReturn(Func->disassembleInstructionAtOffset(Offset))) {
|
||||||
|
Returns.emplace(Addr);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
|
bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
|
||||||
uint64_t Mispreds) {
|
uint64_t Mispreds) {
|
||||||
// Returns whether \p Offset in \p Func contains a return instruction.
|
|
||||||
auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) {
|
|
||||||
auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
|
|
||||||
return Func.hasInstructions()
|
|
||||||
? isReturn(Func.getInstructionAtOffset(Offset))
|
|
||||||
: isReturn(Func.disassembleInstructionAtOffset(Offset));
|
|
||||||
};
|
|
||||||
|
|
||||||
// Mutates \p Addr to an offset into the containing function, performing BAT
|
// Mutates \p Addr to an offset into the containing function, performing BAT
|
||||||
// offset translation and parent lookup.
|
// offset translation and parent lookup.
|
||||||
//
|
//
|
||||||
// Returns the containing function (or BAT parent) and whether the address
|
// Returns the containing function (or BAT parent).
|
||||||
// corresponds to a return (if \p IsFrom) or a call continuation (otherwise).
|
|
||||||
auto handleAddress = [&](uint64_t &Addr, bool IsFrom) {
|
auto handleAddress = [&](uint64_t &Addr, bool IsFrom) {
|
||||||
BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
|
BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
|
||||||
if (!Func) {
|
if (!Func) {
|
||||||
Addr = 0;
|
Addr = 0;
|
||||||
return std::pair{Func, false};
|
return Func;
|
||||||
}
|
}
|
||||||
|
|
||||||
Addr -= Func->getAddress();
|
Addr -= Func->getAddress();
|
||||||
|
|
||||||
bool IsRet = IsFrom && checkReturn(*Func, Addr);
|
|
||||||
|
|
||||||
if (BAT)
|
if (BAT)
|
||||||
Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
|
Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
|
||||||
|
|
||||||
if (BinaryFunction *ParentFunc = getBATParentFunction(*Func))
|
if (BinaryFunction *ParentFunc = getBATParentFunction(*Func))
|
||||||
Func = ParentFunc;
|
return ParentFunc;
|
||||||
|
|
||||||
return std::pair{Func, IsRet};
|
return Func;
|
||||||
};
|
};
|
||||||
|
|
||||||
auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom*/ true);
|
BinaryFunction *FromFunc = handleAddress(From, /*IsFrom*/ true);
|
||||||
auto [ToFunc, _] = handleAddress(To, /*IsFrom*/ false);
|
BinaryFunction *ToFunc = handleAddress(To, /*IsFrom*/ false);
|
||||||
if (!FromFunc && !ToFunc)
|
if (!FromFunc && !ToFunc)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// Ignore returns.
|
|
||||||
if (IsReturn)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
// Treat recursive control transfers as inter-branches.
|
// Treat recursive control transfers as inter-branches.
|
||||||
if (FromFunc == ToFunc && To != 0) {
|
if (FromFunc == ToFunc && To != 0) {
|
||||||
recordBranch(*FromFunc, From, To, Count, Mispreds);
|
recordBranch(*FromFunc, From, To, Count, Mispreds);
|
||||||
@ -783,7 +787,8 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
|
|||||||
return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds);
|
return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
|
bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count,
|
||||||
|
bool IsReturn) {
|
||||||
const uint64_t From = Trace.From, To = Trace.To;
|
const uint64_t From = Trace.From, To = Trace.To;
|
||||||
BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From);
|
BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From);
|
||||||
BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To);
|
BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To);
|
||||||
@ -808,8 +813,8 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
|
|||||||
const uint64_t FuncAddress = FromFunc->getAddress();
|
const uint64_t FuncAddress = FromFunc->getAddress();
|
||||||
std::optional<BoltAddressTranslation::FallthroughListTy> FTs =
|
std::optional<BoltAddressTranslation::FallthroughListTy> FTs =
|
||||||
BAT && BAT->isBATFunction(FuncAddress)
|
BAT && BAT->isBATFunction(FuncAddress)
|
||||||
? BAT->getFallthroughsInTrace(FuncAddress, From, To)
|
? BAT->getFallthroughsInTrace(FuncAddress, From - IsReturn, To)
|
||||||
: getFallthroughsInTrace(*FromFunc, Trace, Count);
|
: getFallthroughsInTrace(*FromFunc, Trace, Count, IsReturn);
|
||||||
if (!FTs) {
|
if (!FTs) {
|
||||||
LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n');
|
LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n');
|
||||||
NumInvalidTraces += Count;
|
NumInvalidTraces += Count;
|
||||||
@ -831,7 +836,7 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
|
|||||||
|
|
||||||
std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
|
std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
|
||||||
DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
|
DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
|
||||||
uint64_t Count) const {
|
uint64_t Count, bool IsReturn) const {
|
||||||
SmallVector<std::pair<uint64_t, uint64_t>, 16> Branches;
|
SmallVector<std::pair<uint64_t, uint64_t>, 16> Branches;
|
||||||
|
|
||||||
BinaryContext &BC = BF.getBinaryContext();
|
BinaryContext &BC = BF.getBinaryContext();
|
||||||
@ -865,9 +870,13 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
|
|||||||
|
|
||||||
// Adjust FromBB if the first LBR is a return from the last instruction in
|
// Adjust FromBB if the first LBR is a return from the last instruction in
|
||||||
// the previous block (that instruction should be a call).
|
// the previous block (that instruction should be a call).
|
||||||
if (Trace.Branch != Trace::FT_ONLY && !BF.containsAddress(Trace.Branch) &&
|
if (IsReturn) {
|
||||||
From == FromBB->getOffset() && !FromBB->isEntryPoint() &&
|
if (From)
|
||||||
!FromBB->isLandingPad()) {
|
FromBB = BF.getBasicBlockContainingOffset(From - 1);
|
||||||
|
else
|
||||||
|
LLVM_DEBUG(dbgs() << "return to the function start: " << Trace << '\n');
|
||||||
|
} else if (Trace.Branch == Trace::EXTERNAL && From == FromBB->getOffset() &&
|
||||||
|
!FromBB->isEntryPoint() && !FromBB->isLandingPad()) {
|
||||||
const BinaryBasicBlock *PrevBB =
|
const BinaryBasicBlock *PrevBB =
|
||||||
BF.getLayout().getBlock(FromBB->getIndex() - 1);
|
BF.getLayout().getBlock(FromBB->getIndex() - 1);
|
||||||
if (PrevBB->getSuccessor(FromBB->getLabel())) {
|
if (PrevBB->getSuccessor(FromBB->getLabel())) {
|
||||||
@ -1557,11 +1566,13 @@ void DataAggregator::processBranchEvents() {
|
|||||||
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
|
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
|
||||||
|
|
||||||
for (const auto &[Trace, Info] : Traces) {
|
for (const auto &[Trace, Info] : Traces) {
|
||||||
if (Trace.Branch != Trace::FT_ONLY &&
|
bool IsReturn = checkReturn(Trace.Branch);
|
||||||
|
// Ignore returns.
|
||||||
|
if (!IsReturn && Trace.Branch != Trace::FT_ONLY &&
|
||||||
Trace.Branch != Trace::FT_EXTERNAL_ORIGIN)
|
Trace.Branch != Trace::FT_EXTERNAL_ORIGIN)
|
||||||
doBranch(Trace.Branch, Trace.From, Info.TakenCount, Info.MispredCount);
|
doBranch(Trace.Branch, Trace.From, Info.TakenCount, Info.MispredCount);
|
||||||
if (Trace.To != Trace::BR_ONLY)
|
if (Trace.To != Trace::BR_ONLY)
|
||||||
doTrace(Trace, Info.TakenCount);
|
doTrace(Trace, Info.TakenCount, IsReturn);
|
||||||
}
|
}
|
||||||
printBranchSamplesDiagnostics();
|
printBranchSamplesDiagnostics();
|
||||||
}
|
}
|
||||||
|
@ -4,29 +4,43 @@
|
|||||||
# RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
|
# RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
|
||||||
## Link against a DSO to ensure PLT entries.
|
## Link against a DSO to ensure PLT entries.
|
||||||
# RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
|
# RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
|
||||||
# RUN: link_fdata %s %t %t.pat PREAGGT1
|
# Trace to a call continuation, not a landing pad/entry point
|
||||||
# RUN: link_fdata %s %t %t.pat2 PREAGGT2
|
# RUN: link_fdata %s %t %t.pa-base PREAGG-BASE
|
||||||
# RUN-DISABLED: link_fdata %s %t %t.patplt PREAGGPLT
|
# Trace from a return to a landing pad/entry point call continuation
|
||||||
|
# RUN: link_fdata %s %t %t.pa-ret PREAGG-RET
|
||||||
|
# Trace from an external location to a landing pad/entry point call continuation
|
||||||
|
# RUN: link_fdata %s %t %t.pa-ext PREAGG-EXT
|
||||||
|
# RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT
|
||||||
|
|
||||||
# RUN: llvm-strip --strip-unneeded %t -o %t.strip
|
# RUN: llvm-strip --strip-unneeded %t -o %t.strip
|
||||||
# RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh
|
# RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh
|
||||||
|
|
||||||
## Check pre-aggregated traces attach call continuation fallthrough count
|
## Check pre-aggregated traces attach call continuation fallthrough count
|
||||||
# RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \
|
## in the basic case (not an entry point, not a landing pad).
|
||||||
# RUN: --print-cfg --print-only=main | FileCheck %s
|
# RUN: llvm-bolt %t.noeh --pa -p %t.pa-base -o %t.out \
|
||||||
|
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-BASE
|
||||||
|
|
||||||
## Check pre-aggregated traces don't attach call continuation fallthrough count
|
## Check pre-aggregated traces from a return attach call continuation
|
||||||
## to secondary entry point (unstripped)
|
## fallthrough count to secondary entry point (unstripped)
|
||||||
# RUN: llvm-bolt %t --pa -p %t.pat2 -o %t.out \
|
# RUN: llvm-bolt %t --pa -p %t.pa-ret -o %t.out \
|
||||||
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
|
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
|
||||||
## Check pre-aggregated traces don't attach call continuation fallthrough count
|
## Check pre-aggregated traces from a return attach call continuation
|
||||||
## to landing pad (stripped, LP)
|
## fallthrough count to landing pad (stripped, landing pad)
|
||||||
# RUN: llvm-bolt %t.strip --pa -p %t.pat2 -o %t.out \
|
# RUN: llvm-bolt %t.strip --pa -p %t.pa-ret -o %t.out \
|
||||||
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
|
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
|
||||||
|
|
||||||
|
## Check pre-aggregated traces from external location don't attach call
|
||||||
|
## continuation fallthrough count to secondary entry point (unstripped)
|
||||||
|
# RUN: llvm-bolt %t --pa -p %t.pa-ext -o %t.out \
|
||||||
|
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
|
||||||
|
## Check pre-aggregated traces from external location don't attach call
|
||||||
|
## continuation fallthrough count to landing pad (stripped, landing pad)
|
||||||
|
# RUN: llvm-bolt %t.strip --pa -p %t.pa-ext -o %t.out \
|
||||||
|
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
|
||||||
|
|
||||||
## Check pre-aggregated traces don't report zero-sized PLT fall-through as
|
## Check pre-aggregated traces don't report zero-sized PLT fall-through as
|
||||||
## invalid trace
|
## invalid trace
|
||||||
# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.patplt -o %t.out | FileCheck %s \
|
# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.pa-plt -o %t.out | FileCheck %s \
|
||||||
# RUN-DISABLED: --check-prefix=CHECK-PLT
|
# RUN-DISABLED: --check-prefix=CHECK-PLT
|
||||||
# CHECK-PLT: traces mismatching disassembled function contents: 0
|
# CHECK-PLT: traces mismatching disassembled function contents: 0
|
||||||
|
|
||||||
@ -56,11 +70,11 @@ main:
|
|||||||
Ltmp0_br:
|
Ltmp0_br:
|
||||||
callq puts@PLT
|
callq puts@PLT
|
||||||
## Check PLT traces are accepted
|
## Check PLT traces are accepted
|
||||||
# PREAGGPLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3
|
# PREAGG-PLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3
|
||||||
## Target is an external-origin call continuation
|
## Target is an external-origin call continuation
|
||||||
# PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2
|
# PREAGG-BASE: T X:0 #Ltmp1# #Ltmp4_br# 2
|
||||||
# CHECK: callq puts@PLT
|
# CHECK-BASE: callq puts@PLT
|
||||||
# CHECK-NEXT: count: 2
|
# CHECK-BASE-NEXT: count: 2
|
||||||
|
|
||||||
Ltmp1:
|
Ltmp1:
|
||||||
movq -0x10(%rbp), %rax
|
movq -0x10(%rbp), %rax
|
||||||
@ -71,24 +85,18 @@ Ltmp4:
|
|||||||
cmpl $0x0, -0x14(%rbp)
|
cmpl $0x0, -0x14(%rbp)
|
||||||
Ltmp4_br:
|
Ltmp4_br:
|
||||||
je Ltmp0
|
je Ltmp0
|
||||||
# CHECK2: je .Ltmp0
|
|
||||||
# CHECK2-NEXT: count: 3
|
|
||||||
|
|
||||||
movl $0xa, -0x18(%rbp)
|
movl $0xa, -0x18(%rbp)
|
||||||
callq foo
|
callq foo
|
||||||
## Target is a binary-local call continuation
|
## Target is a binary-local call continuation
|
||||||
# PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
|
# PREAGG-RET: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
|
||||||
# CHECK: callq foo
|
|
||||||
# CHECK-NEXT: count: 1
|
|
||||||
|
|
||||||
## PLT call continuation fallthrough spanning the call
|
|
||||||
# CHECK2: callq foo
|
|
||||||
# CHECK2-NEXT: count: 3
|
|
||||||
|
|
||||||
## Target is a secondary entry point (unstripped) or a landing pad (stripped)
|
## Target is a secondary entry point (unstripped) or a landing pad (stripped)
|
||||||
# PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2
|
# PREAGG-EXT: T X:0 #Ltmp3# #Ltmp3_br# 1
|
||||||
# CHECK3: callq foo
|
|
||||||
# CHECK3-NEXT: count: 0
|
# CHECK-ATTACH: callq foo
|
||||||
|
# CHECK-ATTACH-NEXT: count: 1
|
||||||
|
# CHECK-SKIP: callq foo
|
||||||
|
# CHECK-SKIP-NEXT: count: 0
|
||||||
|
|
||||||
Ltmp3:
|
Ltmp3:
|
||||||
cmpl $0x0, -0x18(%rbp)
|
cmpl $0x0, -0x18(%rbp)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user