llvm-project/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
Fangrui Song 8f90a5cc45 [llvm-exegesis] Guard __builtin_thread_pointer use with __has_builtin
While Clang targets have supported __builtin_thread_pointer for a very
long time (e.g., 2007 for AArch32, 2015 for AArch64), for some GCC
ports, the support is very new (11.0 for x86[1], while we need to
support GCC 7), and many ports haven't implemented
__builtin_thread_pointer yet (m68k, powerpc, etc).

[1]: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96955
2023-07-17 16:42:11 -07:00

544 lines
20 KiB
C++

//===-- BenchmarkRunner.cpp -------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <array>
#include <memory>
#include <string>
#include "Assembler.h"
#include "BenchmarkRunner.h"
#include "Error.h"
#include "MCInstrDescView.h"
#include "PerfHelper.h"
#include "SubprocessMemory.h"
#include "Target.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/CrashRecoveryContext.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
#ifdef __linux__
#ifdef HAVE_LIBPFM
#include <perfmon/perf_event.h>
#endif
#include <sys/mman.h>
#include <sys/ptrace.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>
#if defined(__GLIBC__) && __has_include(<sys/rseq.h>) && __has_builtin(__builtin_thread_pointer)
#include <sys/rseq.h>
#ifdef RSEQ_SIG
#define GLIBC_INITS_RSEQ
#endif
#endif
#endif
namespace llvm {
namespace exegesis {
BenchmarkRunner::BenchmarkRunner(const LLVMState &State, Benchmark::ModeE Mode,
BenchmarkPhaseSelectorE BenchmarkPhaseSelector,
ExecutionModeE ExecutionMode)
: State(State), Mode(Mode), BenchmarkPhaseSelector(BenchmarkPhaseSelector),
ExecutionMode(ExecutionMode), Scratch(std::make_unique<ScratchSpace>()) {}
BenchmarkRunner::~BenchmarkRunner() = default;
void BenchmarkRunner::FunctionExecutor::accumulateCounterValues(
const llvm::SmallVectorImpl<int64_t> &NewValues,
llvm::SmallVectorImpl<int64_t> *Result) {
const size_t NumValues = std::max(NewValues.size(), Result->size());
if (NumValues > Result->size())
Result->resize(NumValues, 0);
for (size_t I = 0, End = NewValues.size(); I < End; ++I)
(*Result)[I] += NewValues[I];
}
Expected<llvm::SmallVector<int64_t, 4>>
BenchmarkRunner::FunctionExecutor::runAndSample(const char *Counters) const {
// We sum counts when there are several counters for a single ProcRes
// (e.g. P23 on SandyBridge).
llvm::SmallVector<int64_t, 4> CounterValues;
SmallVector<StringRef, 2> CounterNames;
StringRef(Counters).split(CounterNames, '+');
for (auto &CounterName : CounterNames) {
CounterName = CounterName.trim();
Expected<SmallVector<int64_t, 4>> ValueOrError =
runWithCounter(CounterName);
if (!ValueOrError)
return ValueOrError.takeError();
accumulateCounterValues(ValueOrError.get(), &CounterValues);
}
return CounterValues;
}
namespace {
class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
public:
InProcessFunctionExecutorImpl(const LLVMState &State,
object::OwningBinary<object::ObjectFile> Obj,
BenchmarkRunner::ScratchSpace *Scratch)
: State(State), Function(State.createTargetMachine(), std::move(Obj)),
Scratch(Scratch) {}
private:
static void
accumulateCounterValues(const llvm::SmallVector<int64_t, 4> &NewValues,
llvm::SmallVector<int64_t, 4> *Result) {
const size_t NumValues = std::max(NewValues.size(), Result->size());
if (NumValues > Result->size())
Result->resize(NumValues, 0);
for (size_t I = 0, End = NewValues.size(); I < End; ++I)
(*Result)[I] += NewValues[I];
}
Expected<llvm::SmallVector<int64_t, 4>>
runWithCounter(StringRef CounterName) const override {
const ExegesisTarget &ET = State.getExegesisTarget();
char *const ScratchPtr = Scratch->ptr();
auto CounterOrError = ET.createCounter(CounterName, State);
if (!CounterOrError)
return CounterOrError.takeError();
pfm::Counter *Counter = CounterOrError.get().get();
Scratch->clear();
{
auto PS = ET.withSavedState();
CrashRecoveryContext CRC;
CrashRecoveryContext::Enable();
const bool Crashed = !CRC.RunSafely([this, Counter, ScratchPtr]() {
Counter->start();
this->Function(ScratchPtr);
Counter->stop();
});
CrashRecoveryContext::Disable();
PS.reset();
if (Crashed) {
std::string Msg = "snippet crashed while running";
#ifdef LLVM_ON_UNIX
// See "Exit Status for Commands":
// https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xcu_chap02.html
constexpr const int kSigOffset = 128;
if (const char *const SigName = strsignal(CRC.RetCode - kSigOffset)) {
Msg += ": ";
Msg += SigName;
}
#endif
return make_error<SnippetCrash>(std::move(Msg));
}
}
return Counter->readOrError(Function.getFunctionBytes());
}
const LLVMState &State;
const ExecutableFunction Function;
BenchmarkRunner::ScratchSpace *const Scratch;
};
#ifdef __linux__
// The following class implements a function executor that executes the
// benchmark code within a subprocess rather than within the main llvm-exegesis
// process. This allows for much more control over the execution context of the
// snippet, particularly with regard to memory. This class performs all the
// necessary functions to create the subprocess, execute the snippet in the
// subprocess, and report results/handle errors.
class SubProcessFunctionExecutorImpl
: public BenchmarkRunner::FunctionExecutor {
public:
SubProcessFunctionExecutorImpl(const LLVMState &State,
object::OwningBinary<object::ObjectFile> Obj,
const BenchmarkKey &Key)
: State(State), Function(State.createTargetMachine(), std::move(Obj)),
Key(Key) {}
private:
enum ChildProcessExitCodeE {
CounterFDReadFailed = 1,
TranslatingCounterFDFailed,
RSeqDisableFailed,
FunctionDataMappingFailed,
AuxiliaryMemorySetupFailed
};
StringRef childProcessExitCodeToString(int ExitCode) const {
switch (ExitCode) {
case ChildProcessExitCodeE::CounterFDReadFailed:
return "Counter file descriptor read failed";
case ChildProcessExitCodeE::TranslatingCounterFDFailed:
return "Translating counter file descriptor into a file descriptor in "
"the child process failed. This might be due running an older "
"Linux kernel that doesn't support the pidfd_getfd system call "
"(anything before Linux 5.6).";
case ChildProcessExitCodeE::RSeqDisableFailed:
return "Disabling restartable sequences failed";
case ChildProcessExitCodeE::FunctionDataMappingFailed:
return "Failed to map memory for assembled snippet";
case ChildProcessExitCodeE::AuxiliaryMemorySetupFailed:
return "Failed to setup auxiliary memory";
default:
return "Child process returned with unknown exit code";
}
}
Error createSubProcessAndRunBenchmark(
StringRef CounterName, SmallVectorImpl<int64_t> &CounterValues) const {
int PipeFiles[2];
int PipeSuccessOrErr = pipe(PipeFiles);
if (PipeSuccessOrErr != 0) {
return make_error<Failure>(
"Failed to create a pipe for interprocess communication between "
"llvm-exegesis and the benchmarking subprocess");
}
SubprocessMemory SPMemory;
Error MemoryInitError = SPMemory.initializeSubprocessMemory(getpid());
if (MemoryInitError)
return MemoryInitError;
Error AddMemDefError =
SPMemory.addMemoryDefinition(Key.MemoryValues, getpid());
if (AddMemDefError)
return AddMemDefError;
pid_t ParentOrChildPID = fork();
if (ParentOrChildPID == 0) {
// We are in the child process, close the write end of the pipe
close(PipeFiles[1]);
// Unregister handlers, signal handling is now handled through ptrace in
// the host process
llvm::sys::unregisterHandlers();
prepareAndRunBenchmark(PipeFiles[0], Key);
// The child process terminates in the above function, so we should never
// get to this point.
llvm_unreachable("Child process didn't exit when expected.");
}
const ExegesisTarget &ET = State.getExegesisTarget();
auto CounterOrError =
ET.createCounter(CounterName, State, ParentOrChildPID);
if (!CounterOrError)
return CounterOrError.takeError();
pfm::Counter *Counter = CounterOrError.get().get();
close(PipeFiles[0]);
int CounterFileDescriptor = Counter->getFileDescriptor();
ssize_t BytesWritten =
write(PipeFiles[1], &CounterFileDescriptor, sizeof(int));
if (BytesWritten != sizeof(int))
return make_error<Failure>("Writing peformance counter file descriptor "
"to child process failed: " +
Twine(strerror(errno)));
if (ptrace(PTRACE_ATTACH, ParentOrChildPID, NULL, NULL) != 0)
return make_error<Failure>("Failed to attach to the child process: " +
Twine(strerror(errno)));
if (wait(NULL) == -1) {
return make_error<Failure>(
"Failed to wait for child process to stop after attaching: " +
Twine(strerror(errno)));
}
if (ptrace(PTRACE_CONT, ParentOrChildPID, NULL, NULL) != 0)
return make_error<Failure>(
"Failed to continue execution of the child process: " +
Twine(strerror(errno)));
int ChildStatus;
if (wait(&ChildStatus) == -1) {
return make_error<Failure>(
"Waiting for the child process to complete failed: " +
Twine(strerror(errno)));
}
if (WIFEXITED(ChildStatus)) {
int ChildExitCode = WEXITSTATUS(ChildStatus);
if (ChildExitCode == 0) {
// The child exited succesfully, read counter values and return
// success
CounterValues[0] = Counter->read();
return Error::success();
}
// The child exited, but not successfully
return make_error<SnippetCrash>(
"Child benchmarking process exited with non-zero exit code: " +
childProcessExitCodeToString(ChildExitCode));
}
// An error was encountered running the snippet, process it
siginfo_t ChildSignalInfo;
if (ptrace(PTRACE_GETSIGINFO, ParentOrChildPID, NULL, &ChildSignalInfo) ==
-1) {
return make_error<Failure>("Getting signal info from the child failed: " +
Twine(strerror(errno)));
}
return make_error<SnippetCrash>(
"The benchmarking subprocess sent unexpected signal: " +
Twine(strsignal(ChildSignalInfo.si_signo)));
}
[[noreturn]] void prepareAndRunBenchmark(int Pipe,
const BenchmarkKey &Key) const {
// The following occurs within the benchmarking subprocess
int ParentCounterFileDescriptor = -1;
ssize_t BytesRead = read(Pipe, &ParentCounterFileDescriptor, sizeof(int));
if (BytesRead != sizeof(int)) {
exit(ChildProcessExitCodeE::CounterFDReadFailed);
}
pid_t ParentPID = getppid();
// Make sure the following two syscalls are defined on the platform that
// we're building on as they were introduced to the kernel fairly recently
// (v5.6 for the second one).
#if defined(SYS_pidfd_open) && defined(SYS_pidfd_getfd)
int ParentPIDFD = syscall(SYS_pidfd_open, ParentPID, 0);
int CounterFileDescriptor =
syscall(SYS_pidfd_getfd, ParentPIDFD, ParentCounterFileDescriptor, 0);
#else
int CounterFileDescriptor = 0;
exit(ChildProcessExitCodeE::TranslatingCounterFDFailed);
#endif
if (CounterFileDescriptor == -1) {
exit(ChildProcessExitCodeE::TranslatingCounterFDFailed);
}
// Glibc versions greater than 2.35 automatically call rseq during
// initialization. Unmapping the region that glibc sets up for this causes
// segfaults in the program Unregister the rseq region so that we can safely
// unmap it later
#ifdef GLIBC_INITS_RSEQ
long RseqDisableOutput =
syscall(SYS_rseq, (intptr_t)__builtin_thread_pointer() + __rseq_offset,
__rseq_size, RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
if (RseqDisableOutput != 0)
exit(ChildProcessExitCodeE::RSeqDisableFailed);
#endif // GLIBC_INITS_RSEQ
size_t FunctionDataCopySize = this->Function.FunctionBytes.size();
char *FunctionDataCopy =
(char *)mmap(NULL, FunctionDataCopySize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
if ((intptr_t)FunctionDataCopy == -1)
exit(ChildProcessExitCodeE::FunctionDataMappingFailed);
memcpy(FunctionDataCopy, this->Function.FunctionBytes.data(),
this->Function.FunctionBytes.size());
mprotect(FunctionDataCopy, FunctionDataCopySize, PROT_READ | PROT_EXEC);
Expected<int> AuxMemFDOrError =
SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
Key.MemoryValues, ParentPID, CounterFileDescriptor);
if (!AuxMemFDOrError)
exit(ChildProcessExitCodeE::AuxiliaryMemorySetupFailed);
((void (*)(size_t, int))(intptr_t)FunctionDataCopy)(FunctionDataCopySize,
*AuxMemFDOrError);
exit(0);
}
Expected<llvm::SmallVector<int64_t, 4>>
runWithCounter(StringRef CounterName) const override {
SmallVector<int64_t, 4> Value(1, 0);
Error PossibleBenchmarkError =
createSubProcessAndRunBenchmark(CounterName, Value);
if (PossibleBenchmarkError) {
return std::move(PossibleBenchmarkError);
}
return Value;
}
const LLVMState &State;
const ExecutableFunction Function;
const BenchmarkKey &Key;
};
#endif // __linux__
} // namespace
Expected<SmallString<0>> BenchmarkRunner::assembleSnippet(
const BenchmarkCode &BC, const SnippetRepetitor &Repetitor,
unsigned MinInstructions, unsigned LoopBodySize,
bool GenerateMemoryInstructions) const {
const std::vector<MCInst> &Instructions = BC.Key.Instructions;
SmallString<0> Buffer;
raw_svector_ostream OS(Buffer);
if (Error E = assembleToStream(
State.getExegesisTarget(), State.createTargetMachine(), BC.LiveIns,
BC.Key.RegisterInitialValues,
Repetitor.Repeat(Instructions, MinInstructions, LoopBodySize,
GenerateMemoryInstructions),
OS, BC.Key, GenerateMemoryInstructions)) {
return std::move(E);
}
return Buffer;
}
Expected<BenchmarkRunner::RunnableConfiguration>
BenchmarkRunner::getRunnableConfiguration(
const BenchmarkCode &BC, unsigned NumRepetitions, unsigned LoopBodySize,
const SnippetRepetitor &Repetitor) const {
RunnableConfiguration RC;
Benchmark &InstrBenchmark = RC.InstrBenchmark;
InstrBenchmark.Mode = Mode;
InstrBenchmark.CpuName = std::string(State.getTargetMachine().getTargetCPU());
InstrBenchmark.LLVMTriple =
State.getTargetMachine().getTargetTriple().normalize();
InstrBenchmark.NumRepetitions = NumRepetitions;
InstrBenchmark.Info = BC.Info;
const std::vector<MCInst> &Instructions = BC.Key.Instructions;
bool GenerateMemoryInstructions = ExecutionMode == ExecutionModeE::SubProcess;
InstrBenchmark.Key = BC.Key;
// Assemble at least kMinInstructionsForSnippet instructions by repeating
// the snippet for debug/analysis. This is so that the user clearly
// understands that the inside instructions are repeated.
if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareSnippet) {
const int MinInstructionsForSnippet = 4 * Instructions.size();
const int LoopBodySizeForSnippet = 2 * Instructions.size();
auto Snippet =
assembleSnippet(BC, Repetitor, MinInstructionsForSnippet,
LoopBodySizeForSnippet, GenerateMemoryInstructions);
if (Error E = Snippet.takeError())
return std::move(E);
if (auto Err = getBenchmarkFunctionBytes(*Snippet,
InstrBenchmark.AssembledSnippet))
return std::move(Err);
}
// Assemble NumRepetitions instructions repetitions of the snippet for
// measurements.
if (BenchmarkPhaseSelector >
BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) {
auto Snippet = assembleSnippet(BC, Repetitor, InstrBenchmark.NumRepetitions,
LoopBodySize, GenerateMemoryInstructions);
if (Error E = Snippet.takeError())
return std::move(E);
RC.ObjectFile = getObjectFromBuffer(*Snippet);
}
return std::move(RC);
}
Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>>
BenchmarkRunner::createFunctionExecutor(
object::OwningBinary<object::ObjectFile> ObjectFile,
const BenchmarkKey &Key) const {
switch (ExecutionMode) {
case ExecutionModeE::InProcess:
return std::make_unique<InProcessFunctionExecutorImpl>(
State, std::move(ObjectFile), Scratch.get());
case ExecutionModeE::SubProcess:
#ifdef __linux__
return std::make_unique<SubProcessFunctionExecutorImpl>(
State, std::move(ObjectFile), Key);
#else
return make_error<Failure>(
"The subprocess execution mode is only supported on Linux");
#endif
}
llvm_unreachable("ExecutionMode is outside expected range");
}
Expected<Benchmark> BenchmarkRunner::runConfiguration(
RunnableConfiguration &&RC,
const std::optional<StringRef> &DumpFile) const {
Benchmark &InstrBenchmark = RC.InstrBenchmark;
object::OwningBinary<object::ObjectFile> &ObjectFile = RC.ObjectFile;
if (DumpFile && BenchmarkPhaseSelector >
BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) {
auto ObjectFilePath =
writeObjectFile(ObjectFile.getBinary()->getData(), *DumpFile);
if (Error E = ObjectFilePath.takeError()) {
InstrBenchmark.Error = toString(std::move(E));
return std::move(InstrBenchmark);
}
outs() << "Check generated assembly with: /usr/bin/objdump -d "
<< *ObjectFilePath << "\n";
}
if (BenchmarkPhaseSelector < BenchmarkPhaseSelectorE::Measure) {
InstrBenchmark.Error = "actual measurements skipped.";
return std::move(InstrBenchmark);
}
Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>> Executor =
createFunctionExecutor(std::move(ObjectFile), RC.InstrBenchmark.Key);
if (!Executor)
return Executor.takeError();
auto NewMeasurements = runMeasurements(**Executor);
if (Error E = NewMeasurements.takeError()) {
if (!E.isA<SnippetCrash>())
return std::move(E);
InstrBenchmark.Error = toString(std::move(E));
return std::move(InstrBenchmark);
}
assert(InstrBenchmark.NumRepetitions > 0 && "invalid NumRepetitions");
for (BenchmarkMeasure &BM : *NewMeasurements) {
// Scale the measurements by instruction.
BM.PerInstructionValue /= InstrBenchmark.NumRepetitions;
// Scale the measurements by snippet.
BM.PerSnippetValue *=
static_cast<double>(InstrBenchmark.Key.Instructions.size()) /
InstrBenchmark.NumRepetitions;
}
InstrBenchmark.Measurements = std::move(*NewMeasurements);
return std::move(InstrBenchmark);
}
Expected<std::string>
BenchmarkRunner::writeObjectFile(StringRef Buffer, StringRef FileName) const {
int ResultFD = 0;
SmallString<256> ResultPath = FileName;
if (Error E = errorCodeToError(
FileName.empty() ? sys::fs::createTemporaryFile("snippet", "o",
ResultFD, ResultPath)
: sys::fs::openFileForReadWrite(
FileName, ResultFD, sys::fs::CD_CreateAlways,
sys::fs::OF_None)))
return std::move(E);
raw_fd_ostream OFS(ResultFD, true /*ShouldClose*/);
OFS.write(Buffer.data(), Buffer.size());
OFS.flush();
return std::string(ResultPath.str());
}
BenchmarkRunner::FunctionExecutor::~FunctionExecutor() {}
} // namespace exegesis
} // namespace llvm