[BOLT][AArch64] Add support for SPE brstack format (#129231)
Since Linux 6.14, perf can report SPE branch events using the `brstack` format, which matches the layout of LBR/BRBE. This patch reuses the existing LBR parsing logic to support SPE.

Example SPE brstack format:

```bash
perf script -i perf.data -F pid,brstack --itrace=bl
```

```
PID FROM / TO / PREDICTED
16984 0x72e342e5f4/0x72e36192d0/M/-/-/11/RET/-
16984 0x72e7b8b3b4/0x72e7b8b3b8/PN/-/-/11/COND/-
16984 0x72e7b92b48/0x72e7b92b4c/PN/-/-/8/COND/-
16984 0x72eacc6b7c/0x760cc94b00/P/-/-/9/RET/-
16984 0x72e3f210fc/0x72e3f21068/P/-/-/4//-
16984 0x72e39b8c5c/0x72e3627b24/P/-/-/4//-
16984 0x72e7b89d20/0x72e7b92bbc/P/-/-/4/RET/-
```

SPE brstack flags can be two characters long: `PN` or `MN`:
- `P` = predicted branch
- `M` = mispredicted branch
- `N` = optionally appears when the branch is NOT-TAKEN
  - flag is relevant only to conditional branches

Example of usage with BOLT:

1. Capture SPE branch events:
```bash
perf record -e 'arm_spe_0/branch_filter=1/u' -- binary
```

2. Convert profile for BOLT:
```bash
perf2bolt -p perf.data -o perf.fdata --spe binary
```

3. Run BOLT Optimization:
```bash
llvm-bolt binary -o binary.bolted --data perf.fdata ...
```

A unit test verifies the parsing of the 'SPE brstack format'.

---------

Co-authored-by: Paschalis Mpeis <paschalis.mpeis@arm.com>
This commit is contained in:
parent
dd4776d429
commit
f75973949b
@ -85,6 +85,8 @@ private:
|
||||
};
|
||||
friend raw_ostream &operator<<(raw_ostream &OS, const LBREntry &);
|
||||
|
||||
friend struct PerfSpeEventsTestHelper;
|
||||
|
||||
struct PerfBranchSample {
|
||||
SmallVector<LBREntry, 32> LBR;
|
||||
};
|
||||
|
@ -48,6 +48,7 @@ extern llvm::cl::OptionCategory BinaryAnalysisCategory;
|
||||
extern llvm::cl::opt<unsigned> AlignText;
|
||||
extern llvm::cl::opt<unsigned> AlignFunctions;
|
||||
extern llvm::cl::opt<bool> AggregateOnly;
|
||||
extern llvm::cl::opt<bool> ArmSPE;
|
||||
extern llvm::cl::opt<unsigned> BucketsPerLine;
|
||||
extern llvm::cl::opt<bool> CompactCodeModel;
|
||||
extern llvm::cl::opt<bool> DiffOnly;
|
||||
|
@ -49,6 +49,9 @@ static cl::opt<bool>
|
||||
cl::desc("aggregate basic samples (without LBR info)"),
|
||||
cl::cat(AggregatorCategory));
|
||||
|
||||
cl::opt<bool> ArmSPE("spe", cl::desc("Enable Arm SPE mode."),
|
||||
cl::cat(AggregatorCategory));
|
||||
|
||||
static cl::opt<std::string>
|
||||
ITraceAggregation("itrace",
|
||||
cl::desc("Generate LBR info with perf itrace argument"),
|
||||
@ -181,11 +184,21 @@ void DataAggregator::start() {
|
||||
|
||||
findPerfExecutable();
|
||||
|
||||
if (opts::ArmSPE) {
|
||||
// pid from_ip to_ip flags
|
||||
// where flags could be:
|
||||
// P/M: whether branch was Predicted or Mispredicted.
|
||||
// N: optionally appears when the branch was Not-Taken (ie fall-through)
|
||||
// 12345 0x123/0x456/PN/-/-/8/RET/-
|
||||
opts::ITraceAggregation = "bl";
|
||||
opts::ParseMemProfile = true;
|
||||
opts::BasicAggregation = false;
|
||||
}
|
||||
|
||||
if (opts::BasicAggregation) {
|
||||
launchPerfProcess("events without LBR",
|
||||
MainEventsPPI,
|
||||
launchPerfProcess("events without LBR", MainEventsPPI,
|
||||
"script -F pid,event,ip",
|
||||
/*Wait = */false);
|
||||
/*Wait = */ false);
|
||||
} else if (!opts::ITraceAggregation.empty()) {
|
||||
// Disable parsing memory profile from trace data, unless requested by user.
|
||||
if (!opts::ParseMemProfile.getNumOccurrences())
|
||||
@ -994,9 +1007,22 @@ ErrorOr<DataAggregator::LBREntry> DataAggregator::parseLBREntry() {
|
||||
if (std::error_code EC = MispredStrRes.getError())
|
||||
return EC;
|
||||
StringRef MispredStr = MispredStrRes.get();
|
||||
if (MispredStr.size() != 1 ||
|
||||
(MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-')) {
|
||||
reportError("expected single char for mispred bit");
|
||||
// SPE brstack mispredicted flags might be up to two characters long:
|
||||
// 'PN' or 'MN'. Where 'N' optionally appears.
|
||||
bool ValidStrSize = opts::ArmSPE
|
||||
? MispredStr.size() >= 1 && MispredStr.size() <= 2
|
||||
: MispredStr.size() == 1;
|
||||
bool SpeTakenBitErr =
|
||||
(opts::ArmSPE && MispredStr.size() == 2 && MispredStr[1] != 'N');
|
||||
bool PredictionBitErr =
|
||||
!ValidStrSize ||
|
||||
(MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-');
|
||||
if (SpeTakenBitErr)
|
||||
reportError("expected 'N' as SPE prediction bit for a not-taken branch");
|
||||
if (PredictionBitErr)
|
||||
reportError("expected 'P', 'M' or '-' char as a prediction bit");
|
||||
|
||||
if (SpeTakenBitErr || PredictionBitErr) {
|
||||
Diag << "Found: " << MispredStr << "\n";
|
||||
return make_error_code(llvm::errc::io_error);
|
||||
}
|
||||
@ -1497,7 +1523,9 @@ void DataAggregator::printBranchStacksDiagnostics(
|
||||
}
|
||||
|
||||
std::error_code DataAggregator::parseBranchEvents() {
|
||||
outs() << "PERF2BOLT: parse branch events...\n";
|
||||
std::string BranchEventTypeStr =
|
||||
opts::ArmSPE ? "SPE branch events in LBR-format" : "branch events";
|
||||
outs() << "PERF2BOLT: parse " << BranchEventTypeStr << "...\n";
|
||||
NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName,
|
||||
TimerGroupDesc, opts::TimeAggregator);
|
||||
|
||||
@ -1525,7 +1553,8 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||
}
|
||||
|
||||
NumEntries += Sample.LBR.size();
|
||||
if (BAT && Sample.LBR.size() == 32 && !NeedsSkylakeFix) {
|
||||
if (this->BC->isX86() && BAT && Sample.LBR.size() == 32 &&
|
||||
!NeedsSkylakeFix) {
|
||||
errs() << "PERF2BOLT-WARNING: using Intel Skylake bug workaround\n";
|
||||
NeedsSkylakeFix = true;
|
||||
}
|
||||
@ -1548,10 +1577,18 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||
if (NumSamples && NumSamplesNoLBR == NumSamples) {
|
||||
// Note: we don't know if perf2bolt is being used to parse memory samples
|
||||
// at this point. In this case, it is OK to parse zero LBRs.
|
||||
errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
|
||||
"LBR. Record profile with perf record -j any or run perf2bolt "
|
||||
"in no-LBR mode with -nl (the performance improvement in -nl "
|
||||
"mode may be limited)\n";
|
||||
if (!opts::ArmSPE)
|
||||
errs()
|
||||
<< "PERF2BOLT-WARNING: all recorded samples for this binary lack "
|
||||
"LBR. Record profile with perf record -j any or run perf2bolt "
|
||||
"in no-LBR mode with -nl (the performance improvement in -nl "
|
||||
"mode may be limited)\n";
|
||||
else
|
||||
errs()
|
||||
<< "PERF2BOLT-WARNING: All recorded samples for this binary lack "
|
||||
"SPE brstack entries. Make sure you are running Linux perf 6.14 "
|
||||
"or later, otherwise you get zero samples. Record the profile "
|
||||
"with: perf record -e 'arm_spe_0/branch_filter=1/'.";
|
||||
} else {
|
||||
printBranchStacksDiagnostics(NumTotalSamples - NumSamples);
|
||||
}
|
||||
|
12
bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
Normal file
12
bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
Normal file
@ -0,0 +1,12 @@
|
||||
## Check that Arm SPE mode is available on AArch64.
|
||||
|
||||
REQUIRES: system-linux,perf,target=aarch64{{.*}}
|
||||
|
||||
RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
|
||||
|
||||
## The profile is recorded with the generic `cycles` event, not an SPE event,
## so it is not expected to contain SPE brstack entries.
RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe 2> /dev/null
|
||||
|
||||
## perf2bolt's stderr is discarded and its exit status is overridden by
## `exit 0`; only the stdout banner announcing SPE parsing is checked.
RUN: (perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe 2> /dev/null; exit 0) | FileCheck %s --check-prefix=CHECK-SPE-LBR
|
||||
|
||||
CHECK-SPE-LBR: PERF2BOLT: parse SPE branch events in LBR-format
|
||||
|
9
bolt/test/perf2bolt/X86/perf2bolt-spe.test
Normal file
9
bolt/test/perf2bolt/X86/perf2bolt-spe.test
Normal file
@ -0,0 +1,9 @@
|
||||
## Check that Arm SPE mode is unavailable on X86.
|
||||
|
||||
REQUIRES: system-linux,x86_64-linux
|
||||
|
||||
RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
|
||||
## An empty file stands in for the perf profile. NOTE(review): presumably the
## tool rejects --spe before the profile contents matter - confirm.
RUN: touch %t.empty.perf.data
|
||||
## The run must fail (`not`) and print the AArch64-only diagnostic.
RUN: not perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --spe --pa %t.exe 2>&1 | FileCheck %s
|
||||
|
||||
CHECK: perf2bolt: -spe is available only on AArch64.
|
@ -237,6 +237,13 @@ int main(int argc, char **argv) {
|
||||
if (Error E = RIOrErr.takeError())
|
||||
report_error(opts::InputFilename, std::move(E));
|
||||
RewriteInstance &RI = *RIOrErr.get();
|
||||
|
||||
if (opts::AggregateOnly && !RI.getBinaryContext().isAArch64() &&
|
||||
opts::ArmSPE) {
|
||||
errs() << ToolName << ": -spe is available only on AArch64.\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (!opts::PerfData.empty()) {
|
||||
if (!opts::AggregateOnly) {
|
||||
errs() << ToolName
|
||||
|
@ -1,11 +1,25 @@
|
||||
set(LLVM_LINK_COMPONENTS
|
||||
DebugInfoDWARF
|
||||
Object
|
||||
${LLVM_TARGETS_TO_BUILD}
|
||||
)
|
||||
|
||||
add_bolt_unittest(ProfileTests
|
||||
DataAggregator.cpp
|
||||
PerfSpeEvents.cpp
|
||||
|
||||
DISABLE_LLVM_LINK_LLVM_DYLIB
|
||||
)
|
||||
|
||||
target_link_libraries(ProfileTests
|
||||
PRIVATE
|
||||
LLVMBOLTCore
|
||||
LLVMBOLTProfile
|
||||
LLVMTargetParser
|
||||
LLVMTestingSupport
|
||||
)
|
||||
|
||||
# Define <TARGET>_AVAILABLE (upper-cased target name) for every entry in
# BOLT_TARGETS_TO_BUILD so that test sources can be compiled conditionally;
# PerfSpeEvents.cpp is guarded by `#ifdef AARCH64_AVAILABLE`.
foreach (tgt ${BOLT_TARGETS_TO_BUILD})
|
||||
string(TOUPPER "${tgt}" upper)
|
||||
target_compile_definitions(ProfileTests PRIVATE "${upper}_AVAILABLE")
|
||||
endforeach()
|
||||
|
164
bolt/unittests/Profile/PerfSpeEvents.cpp
Normal file
164
bolt/unittests/Profile/PerfSpeEvents.cpp
Normal file
@ -0,0 +1,164 @@
|
||||
//===- bolt/unittests/Profile/PerfSpeEvents.cpp ---------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifdef AARCH64_AVAILABLE
|
||||
|
||||
#include "bolt/Core/BinaryContext.h"
|
||||
#include "bolt/Profile/DataAggregator.h"
|
||||
#include "llvm/BinaryFormat/ELF.h"
|
||||
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace llvm;
|
||||
using namespace llvm::bolt;
|
||||
using namespace llvm::object;
|
||||
using namespace llvm::ELF;
|
||||
|
||||
namespace opts {
|
||||
extern cl::opt<std::string> ReadPerfEvents;
|
||||
extern cl::opt<bool> ArmSPE;
|
||||
} // namespace opts
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
/// Perform checks on perf SPE branch events.
|
||||
struct PerfSpeEventsTestHelper : public testing::Test {
|
||||
void SetUp() override {
|
||||
initalizeLLVM();
|
||||
prepareElf();
|
||||
initializeBOLT();
|
||||
}
|
||||
|
||||
protected:
|
||||
using Trace = DataAggregator::Trace;
|
||||
using TakenBranchInfo = DataAggregator::TakenBranchInfo;
|
||||
|
||||
void initalizeLLVM() {
|
||||
llvm::InitializeAllTargetInfos();
|
||||
llvm::InitializeAllTargetMCs();
|
||||
llvm::InitializeAllAsmParsers();
|
||||
llvm::InitializeAllDisassemblers();
|
||||
llvm::InitializeAllTargets();
|
||||
llvm::InitializeAllAsmPrinters();
|
||||
}
|
||||
|
||||
void prepareElf() {
|
||||
memcpy(ElfBuf, "\177ELF", 4);
|
||||
ELF64LE::Ehdr *EHdr = reinterpret_cast<typename ELF64LE::Ehdr *>(ElfBuf);
|
||||
EHdr->e_ident[llvm::ELF::EI_CLASS] = llvm::ELF::ELFCLASS64;
|
||||
EHdr->e_ident[llvm::ELF::EI_DATA] = llvm::ELF::ELFDATA2LSB;
|
||||
EHdr->e_machine = llvm::ELF::EM_AARCH64;
|
||||
MemoryBufferRef Source(StringRef(ElfBuf, sizeof(ElfBuf)), "ELF");
|
||||
ObjFile = cantFail(ObjectFile::createObjectFile(Source));
|
||||
}
|
||||
|
||||
void initializeBOLT() {
|
||||
Relocation::Arch = ObjFile->makeTriple().getArch();
|
||||
BC = cantFail(BinaryContext::createBinaryContext(
|
||||
ObjFile->makeTriple(), std::make_shared<orc::SymbolStringPool>(),
|
||||
ObjFile->getFileName(), nullptr, /*IsPIC*/ false,
|
||||
DWARFContext::create(*ObjFile.get()), {llvm::outs(), llvm::errs()}));
|
||||
ASSERT_FALSE(!BC);
|
||||
}
|
||||
|
||||
char ElfBuf[sizeof(typename ELF64LE::Ehdr)] = {};
|
||||
std::unique_ptr<ObjectFile> ObjFile;
|
||||
std::unique_ptr<BinaryContext> BC;
|
||||
|
||||
/// Helper function to export lists to show the mismatch.
|
||||
void reportBrStackEventMismatch(
|
||||
const std::vector<std::pair<Trace, TakenBranchInfo>> &Traces,
|
||||
const std::vector<std::pair<Trace, TakenBranchInfo>> &ExpectedSamples) {
|
||||
llvm::errs() << "Traces items: \n";
|
||||
for (const auto &[Trace, BI] : Traces)
|
||||
llvm::errs() << "{" << Trace.Branch << ", " << Trace.From << ","
|
||||
<< Trace.To << ", " << BI.TakenCount << ", "
|
||||
<< BI.MispredCount << "}" << "\n";
|
||||
|
||||
llvm::errs() << "Expected items: \n";
|
||||
for (const auto &[Trace, BI] : ExpectedSamples)
|
||||
llvm::errs() << "{" << Trace.Branch << ", " << Trace.From << ", "
|
||||
<< Trace.To << ", " << BI.TakenCount << ", "
|
||||
<< BI.MispredCount << "}" << "\n";
|
||||
}
|
||||
|
||||
/// Parse and check SPE brstack as LBR.
|
||||
void parseAndCheckBrstackEvents(
|
||||
uint64_t PID,
|
||||
const std::vector<std::pair<Trace, TakenBranchInfo>> &ExpectedSamples) {
|
||||
DataAggregator DA("<pseudo input>");
|
||||
DA.ParsingBuf = opts::ReadPerfEvents;
|
||||
DA.BC = BC.get();
|
||||
DataAggregator::MMapInfo MMap;
|
||||
DA.BinaryMMapInfo.insert(std::make_pair(PID, MMap));
|
||||
|
||||
DA.parseBranchEvents();
|
||||
|
||||
EXPECT_EQ(DA.Traces.size(), ExpectedSamples.size());
|
||||
if (DA.Traces.size() != ExpectedSamples.size())
|
||||
reportBrStackEventMismatch(DA.Traces, ExpectedSamples);
|
||||
|
||||
const auto TracesBegin = DA.Traces.begin();
|
||||
const auto TracesEnd = DA.Traces.end();
|
||||
for (const auto &BI : ExpectedSamples) {
|
||||
auto it = find_if(TracesBegin, TracesEnd,
|
||||
[&BI](const auto &Tr) { return Tr.first == BI.first; });
|
||||
|
||||
EXPECT_NE(it, TracesEnd);
|
||||
EXPECT_EQ(it->second.MispredCount, BI.second.MispredCount);
|
||||
EXPECT_EQ(it->second.TakenCount, BI.second.TakenCount);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
TEST_F(PerfSpeEventsTestHelper, SpeBranchesWithBrstack) {
  // Check perf input with SPE branch events as brstack format.
  // Example collection command:
  // ```
  // perf record -e 'arm_spe_0/branch_filter=1/u' -- BINARY
  // ```
  // How Bolt extracts the branch events:
  // ```
  // perf script -F pid,brstack --itrace=bl
  // ```

  // Both options are process-wide globals shared with every other test in
  // this binary; remember their values so they can be restored afterwards.
  const bool SavedArmSPE = opts::ArmSPE;
  const std::string SavedReadPerfEvents = opts::ReadPerfEvents.getValue();

  opts::ArmSPE = true;
  opts::ReadPerfEvents = " 1234 0xa001/0xa002/PN/-/-/10/COND/-\n"
                         " 1234 0xb001/0xb002/P/-/-/4/RET/-\n"
                         " 1234 0xc456/0xc789/P/-/-/13/-/-\n"
                         " 1234 0xd123/0xd456/M/-/-/7/RET/-\n"
                         " 1234 0xe001/0xe002/P/-/-/14/RET/-\n"
                         " 1234 0xd123/0xd456/M/-/-/7/RET/-\n"
                         " 1234 0xf001/0xf002/MN/-/-/8/COND/-\n"
                         " 1234 0xc456/0xc789/M/-/-/13/-/-\n";

  // Each expected entry pairs a trace with its aggregated counters:
  // {{branch source, branch target, Trace::BR_ONLY},
  //  {TakenCount, MispredCount}}.
  // Example: {{0xd123, 0xd456, Trace::BR_ONLY}, {2, 2}} has TakenCount = 2
  // because the input holds two samples for (0xd123, 0xd456), and
  // MispredCount = 2 because both carry the 'M' misprediction flag.
  // BR_ONLY means the trace only contains branch data.
  // Entries flagged 'PN'/'MN' are still expected with TakenCount = 1.
  std::vector<std::pair<Trace, TakenBranchInfo>> ExpectedSamples = {
      {{0xa001, 0xa002, Trace::BR_ONLY}, {1, 0}},
      {{0xb001, 0xb002, Trace::BR_ONLY}, {1, 0}},
      {{0xc456, 0xc789, Trace::BR_ONLY}, {2, 1}},
      {{0xd123, 0xd456, Trace::BR_ONLY}, {2, 2}},
      {{0xe001, 0xe002, Trace::BR_ONLY}, {1, 0}},
      {{0xf001, 0xf002, Trace::BR_ONLY}, {1, 1}}};

  parseAndCheckBrstackEvents(1234, ExpectedSamples);

  // Restore the globals so SPE mode does not leak into other tests.
  opts::ArmSPE = SavedArmSPE;
  opts::ReadPerfEvents = SavedReadPerfEvents;
}
|
||||
|
||||
#endif
|
Loading…
x
Reference in New Issue
Block a user