[BOLT] Support pre-aggregated returns (#143296)
Intel's Architectural LBR supports capturing branch type information as part of LBR stack (SDM Vol 3B, part 2, October 2024): ``` 20.1.3.2 Branch Types The IA32_LBR_x_INFO.BR_TYPE and IA32_LER_INFO.BR_TYPE fields encode the branch types as shown in Table 20-3. Table 20-3. IA32_LBR_x_INFO and IA32_LER_INFO Branch Type Encodings Encoding | Branch Type 0000B | COND 0001B | NEAR_IND_JMP 0010B | NEAR_REL_JMP 0011B | NEAR_IND_CALL 0100B | NEAR_REL_CALL 0101B | NEAR_RET 011xB | Reserved 1xxxB | OTHER_BRANCH For a list of branch operations that fall into the categories above, see Table 20-2. Table 20-2. Branch Type Filtering Details Branch Type | Operations Recorded COND | Jcc, J*CXZ, and LOOP* NEAR_IND_JMP | JMP r/m* NEAR_REL_JMP | JMP rel* NEAR_IND_CALL | CALL r/m* NEAR_REL_CALL | CALL rel* (excluding CALLs to the next sequential IP) NEAR_RET | RET (0C3H) OTHER_BRANCH | JMP/CALL ptr*, JMP/CALL m*, RET (0C8H), SYS*, interrupts, exceptions (other than debug exceptions), IRET, INT3, INTn, INTO, TSX Abort, EENTER, ERESUME, EEXIT, AEX, INIT, SIPI, RSM ``` Linux kernel can preserve branch type when `save_type` is enabled, even if CPU does not support Architectural LBR:f09079bd04/tools/perf/Documentation/perf-record.txt (L457-L460)
> - save_type: save branch type during sampling in case binary is not available later. For the platforms with Intel Arch LBR support (12th-Gen+ client or 4th-Gen Xeon+ server), the save branch type is unconditionally enabled when the taken branch stack sampling is enabled. Kernel-reported branch type values:8c6bc74c7f/include/uapi/linux/perf_event.h (L251-L269)
This information is needed to disambiguate external returns (from DSO/JIT) to an entry point or a landing pad, when BOLT can't disassemble the branch source. This patch adds new pre-aggregated types: - return trace (R), - external return fall-through (r). For such types, the checks for fall-through start (not an entry or a landing pad) are relaxed. Depends on #143295. Test Plan: updated callcont-fallthru.s
This commit is contained in:
parent
a5fa5bd2a8
commit
7085065c02
@ -101,16 +101,17 @@ private:
|
||||
uint64_t Addr;
|
||||
};
|
||||
|
||||
/// Container for the unit of branch data.
|
||||
/// Backwards compatible with legacy use for branches and fall-throughs:
|
||||
/// - if \p Branch is FT_ONLY or FT_EXTERNAL_ORIGIN, the trace only
|
||||
/// contains fall-through data,
|
||||
/// - if \p To is BR_ONLY, the trace only contains branch data.
|
||||
/// Container for the unit of branch data, matching pre-aggregated trace type.
|
||||
/// Backwards compatible with branch and fall-through types:
|
||||
/// - if \p To is < 0, the trace only contains branch data (BR_ONLY),
|
||||
/// - if \p Branch is < 0, the trace only contains fall-through data
|
||||
/// (FT_ONLY, FT_EXTERNAL_ORIGIN, or FT_EXTERNAL_RETURN).
|
||||
struct Trace {
|
||||
static constexpr const uint64_t EXTERNAL = 0ULL;
|
||||
static constexpr const uint64_t BR_ONLY = -1ULL;
|
||||
static constexpr const uint64_t FT_ONLY = -1ULL;
|
||||
static constexpr const uint64_t FT_EXTERNAL_ORIGIN = -2ULL;
|
||||
static constexpr const uint64_t FT_EXTERNAL_RETURN = -3ULL;
|
||||
|
||||
uint64_t Branch;
|
||||
uint64_t From;
|
||||
@ -390,9 +391,9 @@ private:
|
||||
/// File format syntax:
|
||||
/// E <event>
|
||||
/// S <start> <count>
|
||||
/// T <start> <end> <ft_end> <count>
|
||||
/// [TR] <start> <end> <ft_end> <count>
|
||||
/// B <start> <end> <count> <mispred_count>
|
||||
/// [Ff] <start> <end> <count>
|
||||
/// [Ffr] <start> <end> <count>
|
||||
///
|
||||
/// where <start>, <end>, <ft_end> have the format [<id>:]<offset>
|
||||
///
|
||||
@ -403,8 +404,11 @@ private:
|
||||
/// f - an aggregated fall-through with external origin - used to disambiguate
|
||||
/// between a return hitting a basic block head and a regular internal
|
||||
/// jump to the block
|
||||
/// r - an aggregated fall-through originating at an external return, no
|
||||
/// checks are performed for a fallthrough start
|
||||
/// T - an aggregated trace: branch from <start> to <end> with a fall-through
|
||||
/// to <ft_end>
|
||||
/// R - an aggregated trace originating at a return
|
||||
///
|
||||
/// <id> - build id of the object containing the address. We can skip it for
|
||||
/// the main binary and use "X" for an unknown object. This will save some
|
||||
@ -532,7 +536,12 @@ inline raw_ostream &operator<<(raw_ostream &OS,
|
||||
const DataAggregator::Trace &T) {
|
||||
switch (T.Branch) {
|
||||
case DataAggregator::Trace::FT_ONLY:
|
||||
break;
|
||||
case DataAggregator::Trace::FT_EXTERNAL_ORIGIN:
|
||||
OS << "X:0 -> ";
|
||||
break;
|
||||
case DataAggregator::Trace::FT_EXTERNAL_RETURN:
|
||||
OS << "X:R -> ";
|
||||
break;
|
||||
default:
|
||||
OS << Twine::utohexstr(T.Branch) << " -> ";
|
||||
|
@ -537,8 +537,7 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
|
||||
|
||||
heatmap:
|
||||
// Sort parsed traces for faster processing.
|
||||
if (!opts::BasicAggregation)
|
||||
llvm::sort(Traces, llvm::less_first());
|
||||
llvm::sort(Traces, llvm::less_first());
|
||||
|
||||
if (!opts::HeatmapMode)
|
||||
return Error::success();
|
||||
@ -883,13 +882,9 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
|
||||
|
||||
// Adjust FromBB if the first LBR is a return from the last instruction in
|
||||
// the previous block (that instruction should be a call).
|
||||
if (IsReturn) {
|
||||
if (From)
|
||||
FromBB = BF.getBasicBlockContainingOffset(From - 1);
|
||||
else
|
||||
LLVM_DEBUG(dbgs() << "return to the function start: " << Trace << '\n');
|
||||
} else if (Trace.Branch == Trace::EXTERNAL && From == FromBB->getOffset() &&
|
||||
!FromBB->isEntryPoint() && !FromBB->isLandingPad()) {
|
||||
if (Trace.Branch != Trace::FT_ONLY && !BF.containsAddress(Trace.Branch) &&
|
||||
From == FromBB->getOffset() &&
|
||||
(IsReturn ? From : !(FromBB->isEntryPoint() || FromBB->isLandingPad()))) {
|
||||
const BinaryBasicBlock *PrevBB =
|
||||
BF.getLayout().getBlock(FromBB->getIndex() - 1);
|
||||
if (PrevBB->getSuccessor(FromBB->getLabel())) {
|
||||
@ -1228,12 +1223,14 @@ ErrorOr<Location> DataAggregator::parseLocationOrOffset() {
|
||||
std::error_code DataAggregator::parseAggregatedLBREntry() {
|
||||
enum AggregatedLBREntry : char {
|
||||
INVALID = 0,
|
||||
EVENT_NAME, // E
|
||||
TRACE, // T
|
||||
SAMPLE, // S
|
||||
BRANCH, // B
|
||||
FT, // F
|
||||
FT_EXTERNAL_ORIGIN // f
|
||||
EVENT_NAME, // E
|
||||
TRACE, // T
|
||||
RETURN, // R
|
||||
SAMPLE, // S
|
||||
BRANCH, // B
|
||||
FT, // F
|
||||
FT_EXTERNAL_ORIGIN, // f
|
||||
FT_EXTERNAL_RETURN // r
|
||||
} Type = INVALID;
|
||||
|
||||
/// The number of fields to parse, set based on \p Type.
|
||||
@ -1261,20 +1258,22 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
|
||||
|
||||
Type = StringSwitch<AggregatedLBREntry>(Str)
|
||||
.Case("T", TRACE)
|
||||
.Case("R", RETURN)
|
||||
.Case("S", SAMPLE)
|
||||
.Case("E", EVENT_NAME)
|
||||
.Case("B", BRANCH)
|
||||
.Case("F", FT)
|
||||
.Case("f", FT_EXTERNAL_ORIGIN)
|
||||
.Case("r", FT_EXTERNAL_RETURN)
|
||||
.Default(INVALID);
|
||||
|
||||
if (Type == INVALID) {
|
||||
reportError("expected T, S, E, B, F or f");
|
||||
reportError("expected T, R, S, E, B, F, f or r");
|
||||
return make_error_code(llvm::errc::io_error);
|
||||
}
|
||||
|
||||
using SSI = StringSwitch<int>;
|
||||
AddrNum = SSI(Str).Case("T", 3).Case("S", 1).Case("E", 0).Default(2);
|
||||
AddrNum = SSI(Str).Cases("T", "R", 3).Case("S", 1).Case("E", 0).Default(2);
|
||||
CounterNum = SSI(Str).Case("B", 2).Case("E", 0).Default(1);
|
||||
}
|
||||
|
||||
@ -1331,17 +1330,30 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
|
||||
if (ToFunc)
|
||||
ToFunc->setHasProfileAvailable();
|
||||
|
||||
/// For legacy fall-through types, adjust locations to match Trace container.
|
||||
if (Type == FT || Type == FT_EXTERNAL_ORIGIN) {
|
||||
/// For fall-through types, adjust locations to match Trace container.
|
||||
if (Type == FT || Type == FT_EXTERNAL_ORIGIN || Type == FT_EXTERNAL_RETURN) {
|
||||
Addr[2] = Location(Addr[1]->Offset); // Trace To
|
||||
Addr[1] = Location(Addr[0]->Offset); // Trace From
|
||||
// Put a magic value into Trace Branch to differentiate from a full trace.
|
||||
Addr[0] = Location(Type == FT ? Trace::FT_ONLY : Trace::FT_EXTERNAL_ORIGIN);
|
||||
// Put a magic value into Trace Branch to differentiate from a full trace:
|
||||
if (Type == FT)
|
||||
Addr[0] = Location(Trace::FT_ONLY);
|
||||
else if (Type == FT_EXTERNAL_ORIGIN)
|
||||
Addr[0] = Location(Trace::FT_EXTERNAL_ORIGIN);
|
||||
else if (Type == FT_EXTERNAL_RETURN)
|
||||
Addr[0] = Location(Trace::FT_EXTERNAL_RETURN);
|
||||
else
|
||||
llvm_unreachable("Unexpected fall-through type");
|
||||
}
|
||||
|
||||
/// For legacy branch type, mark Trace To to differentite from a full trace.
|
||||
if (Type == BRANCH) {
|
||||
/// For branch type, mark Trace To to differentiate from a full trace.
|
||||
if (Type == BRANCH)
|
||||
Addr[2] = Location(Trace::BR_ONLY);
|
||||
|
||||
if (Type == RETURN) {
|
||||
if (!Addr[0]->Offset)
|
||||
Addr[0]->Offset = Trace::FT_EXTERNAL_RETURN;
|
||||
else
|
||||
Returns.emplace(Addr[0]->Offset);
|
||||
}
|
||||
|
||||
/// Record a trace.
|
||||
@ -1602,6 +1614,7 @@ void DataAggregator::processBranchEvents() {
|
||||
NamedRegionTimer T("processBranch", "Processing branch events",
|
||||
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
|
||||
|
||||
Returns.emplace(Trace::FT_EXTERNAL_RETURN);
|
||||
for (const auto &[Trace, Info] : Traces) {
|
||||
bool IsReturn = checkReturn(Trace.Branch);
|
||||
// Ignore returns.
|
||||
|
@ -10,6 +10,10 @@
|
||||
# RUN: link_fdata %s %t %t.pa-ret PREAGG-RET
|
||||
# Trace from an external location to a landing pad/entry point call continuation
|
||||
# RUN: link_fdata %s %t %t.pa-ext PREAGG-EXT
|
||||
# Return trace to a landing pad/entry point call continuation
|
||||
# RUN: link_fdata %s %t %t.pa-pret PREAGG-PRET
|
||||
# External return to a landing pad/entry point call continuation
|
||||
# RUN: link_fdata %s %t %t.pa-eret PREAGG-ERET
|
||||
# RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT
|
||||
|
||||
# RUN: llvm-strip --strip-unneeded %t -o %t.strip
|
||||
@ -38,6 +42,21 @@
|
||||
# RUN: llvm-bolt %t.strip --pa -p %t.pa-ext -o %t.out \
|
||||
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
|
||||
|
||||
## Check pre-aggregated return traces from external location attach call
|
||||
## continuation fallthrough count to secondary entry point (unstripped)
|
||||
# RUN: llvm-bolt %t --pa -p %t.pa-pret -o %t.out \
|
||||
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
|
||||
## Check pre-aggregated return traces from external location attach call
|
||||
## continuation fallthrough count to landing pad (stripped, landing pad)
|
||||
# RUN: llvm-bolt %t.strip --pa -p %t.pa-pret -o %t.out \
|
||||
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
|
||||
|
||||
## Same for external return type
|
||||
# RUN: llvm-bolt %t --pa -p %t.pa-eret -o %t.out \
|
||||
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
|
||||
# RUN: llvm-bolt %t.strip --pa -p %t.pa-eret -o %t.out \
|
||||
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
|
||||
|
||||
## Check pre-aggregated traces don't report zero-sized PLT fall-through as
|
||||
## invalid trace
|
||||
# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.pa-plt -o %t.out | FileCheck %s \
|
||||
@ -92,6 +111,10 @@ Ltmp4_br:
|
||||
# PREAGG-RET: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
|
||||
## Target is a secondary entry point (unstripped) or a landing pad (stripped)
|
||||
# PREAGG-EXT: T X:0 #Ltmp3# #Ltmp3_br# 1
|
||||
## Pre-aggregated return trace
|
||||
# PREAGG-PRET: R X:0 #Ltmp3# #Ltmp3_br# 1
|
||||
## External return
|
||||
# PREAGG-ERET: r #Ltmp3# #Ltmp3_br# 1
|
||||
|
||||
# CHECK-ATTACH: callq foo
|
||||
# CHECK-ATTACH-NEXT: count: 1
|
||||
|
@ -36,9 +36,9 @@ prefix_pat = re.compile(f"^# {args.prefix}: (.*)")
|
||||
fdata_pat = re.compile(r"([01].*) (?P<mispred>\d+) (?P<exec>\d+)")
|
||||
|
||||
# Pre-aggregated profile:
|
||||
# {T|S|E|B|F|f} <start> [<end>] [<ft_end>] <count> [<mispred_count>]
|
||||
# {T|R|S|E|B|F|f|r} <start> [<end>] [<ft_end>] <count> [<mispred_count>]
|
||||
# <loc>: [<id>:]<offset>
|
||||
preagg_pat = re.compile(r"(?P<type>[TSBFf]) (?P<offsets_count>.*)")
|
||||
preagg_pat = re.compile(r"(?P<type>[TRSBFfr]) (?P<offsets_count>.*)")
|
||||
|
||||
# No-LBR profile:
|
||||
# <is symbol?> <closest elf symbol or DSO name> <relative address> <count>
|
||||
|
Loading…
x
Reference in New Issue
Block a user