llvm-project/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
Aiden Grossman ceb196d990
[llvm-exegesis] Validate that address annotations are aligned (#75554)
This patch adds in validation at two different levels that address
annotations are page aligned. This is necessary as otherwise the mmap
calls will fail as MAP_FIXED/MAP_FIXED_NOREPLACE require page aligned
addresses. This happens silently in the subprocess. This patch adds
validation at snippet parsing time to give feedback to the user and also
adds asserts at code generation/address usage time to ensure that other
users of the Exegesis APIs conform to the same requirements.
2023-12-15 09:45:30 -08:00

643 lines
23 KiB
C++

//===-- BenchmarkRunner.cpp -------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <memory>
#include <string>
#include "Assembler.h"
#include "BenchmarkRunner.h"
#include "Error.h"
#include "MCInstrDescView.h"
#include "MmapUtils.h"
#include "PerfHelper.h"
#include "SubprocessMemory.h"
#include "Target.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/CrashRecoveryContext.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/SystemZ/zOSSupport.h"
#ifdef __linux__
#ifdef HAVE_LIBPFM
#include <perfmon/perf_event.h>
#endif
#include <sys/mman.h>
#include <sys/ptrace.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>
#if defined(__GLIBC__) && __has_include(<sys/rseq.h>) && defined(HAVE_BUILTIN_THREAD_POINTER)
#include <sys/rseq.h>
#if defined(RSEQ_SIG) && defined(SYS_rseq)
#define GLIBC_INITS_RSEQ
#endif
#endif
#endif // __linux__
namespace llvm {
namespace exegesis {
BenchmarkRunner::BenchmarkRunner(const LLVMState &State, Benchmark::ModeE Mode,
BenchmarkPhaseSelectorE BenchmarkPhaseSelector,
ExecutionModeE ExecutionMode)
: State(State), Mode(Mode), BenchmarkPhaseSelector(BenchmarkPhaseSelector),
ExecutionMode(ExecutionMode), Scratch(std::make_unique<ScratchSpace>()) {}
BenchmarkRunner::~BenchmarkRunner() = default;
void BenchmarkRunner::FunctionExecutor::accumulateCounterValues(
const llvm::SmallVectorImpl<int64_t> &NewValues,
llvm::SmallVectorImpl<int64_t> *Result) {
const size_t NumValues = std::max(NewValues.size(), Result->size());
if (NumValues > Result->size())
Result->resize(NumValues, 0);
for (size_t I = 0, End = NewValues.size(); I < End; ++I)
(*Result)[I] += NewValues[I];
}
Expected<llvm::SmallVector<int64_t, 4>>
BenchmarkRunner::FunctionExecutor::runAndSample(const char *Counters) const {
// We sum counts when there are several counters for a single ProcRes
// (e.g. P23 on SandyBridge).
llvm::SmallVector<int64_t, 4> CounterValues;
SmallVector<StringRef, 2> CounterNames;
StringRef(Counters).split(CounterNames, '+');
for (auto &CounterName : CounterNames) {
CounterName = CounterName.trim();
Expected<SmallVector<int64_t, 4>> ValueOrError =
runWithCounter(CounterName);
if (!ValueOrError)
return ValueOrError.takeError();
accumulateCounterValues(ValueOrError.get(), &CounterValues);
}
return CounterValues;
}
namespace {
class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
public:
static Expected<std::unique_ptr<InProcessFunctionExecutorImpl>>
create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
BenchmarkRunner::ScratchSpace *Scratch) {
Expected<ExecutableFunction> EF =
ExecutableFunction::create(State.createTargetMachine(), std::move(Obj));
if (!EF)
return EF.takeError();
return std::unique_ptr<InProcessFunctionExecutorImpl>(
new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch));
}
private:
InProcessFunctionExecutorImpl(const LLVMState &State,
ExecutableFunction Function,
BenchmarkRunner::ScratchSpace *Scratch)
: State(State), Function(std::move(Function)), Scratch(Scratch) {}
static void
accumulateCounterValues(const llvm::SmallVector<int64_t, 4> &NewValues,
llvm::SmallVector<int64_t, 4> *Result) {
const size_t NumValues = std::max(NewValues.size(), Result->size());
if (NumValues > Result->size())
Result->resize(NumValues, 0);
for (size_t I = 0, End = NewValues.size(); I < End; ++I)
(*Result)[I] += NewValues[I];
}
Expected<llvm::SmallVector<int64_t, 4>>
runWithCounter(StringRef CounterName) const override {
const ExegesisTarget &ET = State.getExegesisTarget();
char *const ScratchPtr = Scratch->ptr();
auto CounterOrError = ET.createCounter(CounterName, State);
if (!CounterOrError)
return CounterOrError.takeError();
pfm::Counter *Counter = CounterOrError.get().get();
Scratch->clear();
{
auto PS = ET.withSavedState();
CrashRecoveryContext CRC;
CrashRecoveryContext::Enable();
const bool Crashed = !CRC.RunSafely([this, Counter, ScratchPtr]() {
Counter->start();
this->Function(ScratchPtr);
Counter->stop();
});
CrashRecoveryContext::Disable();
PS.reset();
if (Crashed) {
#ifdef LLVM_ON_UNIX
// See "Exit Status for Commands":
// https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xcu_chap02.html
constexpr const int kSigOffset = 128;
return make_error<SnippetSignal>(CRC.RetCode - kSigOffset);
#else
// The exit code of the process on windows is not meaningful as a
// signal, so simply pass in -1 as the signal into the error.
return make_error<SnippetSignal>(-1);
#endif // LLVM_ON_UNIX
}
}
return Counter->readOrError(Function.getFunctionBytes());
}
const LLVMState &State;
const ExecutableFunction Function;
BenchmarkRunner::ScratchSpace *const Scratch;
};
#ifdef __linux__
// The following class implements a function executor that executes the
// benchmark code within a subprocess rather than within the main llvm-exegesis
// process. This allows for much more control over the execution context of the
// snippet, particularly with regard to memory. This class performs all the
// necessary functions to create the subprocess, execute the snippet in the
// subprocess, and report results/handle errors.
class SubProcessFunctionExecutorImpl
: public BenchmarkRunner::FunctionExecutor {
public:
static Expected<std::unique_ptr<SubProcessFunctionExecutorImpl>>
create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
const BenchmarkKey &Key) {
Expected<ExecutableFunction> EF =
ExecutableFunction::create(State.createTargetMachine(), std::move(Obj));
if (!EF)
return EF.takeError();
return std::unique_ptr<SubProcessFunctionExecutorImpl>(
new SubProcessFunctionExecutorImpl(State, std::move(*EF), Key));
}
private:
SubProcessFunctionExecutorImpl(const LLVMState &State,
ExecutableFunction Function,
const BenchmarkKey &Key)
: State(State), Function(std::move(Function)), Key(Key) {}
enum ChildProcessExitCodeE {
CounterFDReadFailed = 1,
RSeqDisableFailed,
FunctionDataMappingFailed,
AuxiliaryMemorySetupFailed
};
StringRef childProcessExitCodeToString(int ExitCode) const {
switch (ExitCode) {
case ChildProcessExitCodeE::CounterFDReadFailed:
return "Counter file descriptor read failed";
case ChildProcessExitCodeE::RSeqDisableFailed:
return "Disabling restartable sequences failed";
case ChildProcessExitCodeE::FunctionDataMappingFailed:
return "Failed to map memory for assembled snippet";
case ChildProcessExitCodeE::AuxiliaryMemorySetupFailed:
return "Failed to setup auxiliary memory";
default:
return "Child process returned with unknown exit code";
}
}
Error sendFileDescriptorThroughSocket(int SocketFD, int FD) const {
struct msghdr Message = {};
char Buffer[CMSG_SPACE(sizeof(FD))];
memset(Buffer, 0, sizeof(Buffer));
Message.msg_control = Buffer;
Message.msg_controllen = sizeof(Buffer);
struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message);
ControlMessage->cmsg_level = SOL_SOCKET;
ControlMessage->cmsg_type = SCM_RIGHTS;
ControlMessage->cmsg_len = CMSG_LEN(sizeof(FD));
memcpy(CMSG_DATA(ControlMessage), &FD, sizeof(FD));
Message.msg_controllen = CMSG_SPACE(sizeof(FD));
ssize_t BytesWritten = sendmsg(SocketFD, &Message, 0);
if (BytesWritten < 0)
return make_error<Failure>("Failed to write FD to socket: " +
Twine(strerror(errno)));
return Error::success();
}
Expected<int> getFileDescriptorFromSocket(int SocketFD) const {
struct msghdr Message = {};
char ControlBuffer[256];
Message.msg_control = ControlBuffer;
Message.msg_controllen = sizeof(ControlBuffer);
ssize_t BytesRead = recvmsg(SocketFD, &Message, 0);
if (BytesRead < 0)
return make_error<Failure>("Failed to read FD from socket: " +
Twine(strerror(errno)));
struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message);
int FD;
if (ControlMessage->cmsg_len != CMSG_LEN(sizeof(FD)))
return make_error<Failure>("Failed to get correct number of bytes for "
"file descriptor from socket.");
memcpy(&FD, CMSG_DATA(ControlMessage), sizeof(FD));
return FD;
}
Error createSubProcessAndRunBenchmark(
StringRef CounterName, SmallVectorImpl<int64_t> &CounterValues) const {
int PipeFiles[2];
int PipeSuccessOrErr = socketpair(AF_UNIX, SOCK_DGRAM, 0, PipeFiles);
if (PipeSuccessOrErr != 0) {
return make_error<Failure>(
"Failed to create a pipe for interprocess communication between "
"llvm-exegesis and the benchmarking subprocess: " +
Twine(strerror(errno)));
}
SubprocessMemory SPMemory;
Error MemoryInitError = SPMemory.initializeSubprocessMemory(getpid());
if (MemoryInitError)
return MemoryInitError;
Error AddMemDefError =
SPMemory.addMemoryDefinition(Key.MemoryValues, getpid());
if (AddMemDefError)
return AddMemDefError;
pid_t ParentOrChildPID = fork();
if (ParentOrChildPID == -1) {
return make_error<Failure>("Failed to create child process: " +
Twine(strerror(errno)));
}
if (ParentOrChildPID == 0) {
// We are in the child process, close the write end of the pipe
close(PipeFiles[1]);
// Unregister handlers, signal handling is now handled through ptrace in
// the host process
llvm::sys::unregisterHandlers();
prepareAndRunBenchmark(PipeFiles[0], Key);
// The child process terminates in the above function, so we should never
// get to this point.
llvm_unreachable("Child process didn't exit when expected.");
}
const ExegesisTarget &ET = State.getExegesisTarget();
auto CounterOrError =
ET.createCounter(CounterName, State, ParentOrChildPID);
if (!CounterOrError)
return CounterOrError.takeError();
pfm::Counter *Counter = CounterOrError.get().get();
close(PipeFiles[0]);
// Make sure to attach to the process (and wait for the sigstop to be
// delivered and for the process to continue) before we write to the counter
// file descriptor. Attaching to the process before writing to the socket
// ensures that the subprocess at most has blocked on the read call. If we
// attach afterwards, the subprocess might exit before we get to the attach
// call due to effects like scheduler contention, introducing transient
// failures.
if (ptrace(PTRACE_ATTACH, ParentOrChildPID, NULL, NULL) != 0)
return make_error<Failure>("Failed to attach to the child process: " +
Twine(strerror(errno)));
if (wait(NULL) == -1) {
return make_error<Failure>(
"Failed to wait for child process to stop after attaching: " +
Twine(strerror(errno)));
}
if (ptrace(PTRACE_CONT, ParentOrChildPID, NULL, NULL) != 0)
return make_error<Failure>(
"Failed to continue execution of the child process: " +
Twine(strerror(errno)));
int CounterFileDescriptor = Counter->getFileDescriptor();
Error SendError =
sendFileDescriptorThroughSocket(PipeFiles[1], CounterFileDescriptor);
if (SendError)
return SendError;
int ChildStatus;
if (wait(&ChildStatus) == -1) {
return make_error<Failure>(
"Waiting for the child process to complete failed: " +
Twine(strerror(errno)));
}
if (WIFEXITED(ChildStatus)) {
int ChildExitCode = WEXITSTATUS(ChildStatus);
if (ChildExitCode == 0) {
// The child exited succesfully, read counter values and return
// success
CounterValues[0] = Counter->read();
return Error::success();
}
// The child exited, but not successfully
return make_error<Failure>(
"Child benchmarking process exited with non-zero exit code: " +
childProcessExitCodeToString(ChildExitCode));
}
// An error was encountered running the snippet, process it
siginfo_t ChildSignalInfo;
if (ptrace(PTRACE_GETSIGINFO, ParentOrChildPID, NULL, &ChildSignalInfo) ==
-1) {
return make_error<Failure>("Getting signal info from the child failed: " +
Twine(strerror(errno)));
}
if (ChildSignalInfo.si_signo == SIGSEGV)
return make_error<SnippetSegmentationFault>(
reinterpret_cast<intptr_t>(ChildSignalInfo.si_addr));
return make_error<SnippetSignal>(ChildSignalInfo.si_signo);
}
void disableCoreDumps() const {
struct rlimit rlim;
rlim.rlim_cur = 0;
setrlimit(RLIMIT_CORE, &rlim);
}
[[noreturn]] void prepareAndRunBenchmark(int Pipe,
const BenchmarkKey &Key) const {
// Disable core dumps in the child process as otherwise everytime we
// encounter an execution failure like a segmentation fault, we will create
// a core dump. We report the information directly rather than require the
// user inspect a core dump.
disableCoreDumps();
// The following occurs within the benchmarking subprocess
pid_t ParentPID = getppid();
Expected<int> CounterFileDescriptorOrError =
getFileDescriptorFromSocket(Pipe);
if (!CounterFileDescriptorOrError)
exit(ChildProcessExitCodeE::CounterFDReadFailed);
int CounterFileDescriptor = *CounterFileDescriptorOrError;
// Glibc versions greater than 2.35 automatically call rseq during
// initialization. Unmapping the region that glibc sets up for this causes
// segfaults in the program Unregister the rseq region so that we can safely
// unmap it later
#ifdef GLIBC_INITS_RSEQ
long RseqDisableOutput =
syscall(SYS_rseq, (intptr_t)__builtin_thread_pointer() + __rseq_offset,
__rseq_size, RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
if (RseqDisableOutput != 0)
exit(ChildProcessExitCodeE::RSeqDisableFailed);
#endif // GLIBC_INITS_RSEQ
// The frontend that generates the memory annotation structures should
// validate that the address to map the snippet in at is a multiple of
// the page size. Assert that this is true here.
assert(Key.SnippetAddress % getpagesize() == 0 &&
"The snippet address needs to be aligned to a page boundary.");
size_t FunctionDataCopySize = this->Function.FunctionBytes.size();
void *MapAddress = NULL;
int MapFlags = MAP_PRIVATE | MAP_ANONYMOUS;
if (Key.SnippetAddress != 0) {
MapAddress = reinterpret_cast<void *>(Key.SnippetAddress);
MapFlags |= MAP_FIXED_NOREPLACE;
}
char *FunctionDataCopy =
(char *)mmap(MapAddress, FunctionDataCopySize, PROT_READ | PROT_WRITE,
MapFlags, 0, 0);
if ((intptr_t)FunctionDataCopy == -1)
exit(ChildProcessExitCodeE::FunctionDataMappingFailed);
memcpy(FunctionDataCopy, this->Function.FunctionBytes.data(),
this->Function.FunctionBytes.size());
mprotect(FunctionDataCopy, FunctionDataCopySize, PROT_READ | PROT_EXEC);
Expected<int> AuxMemFDOrError =
SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
Key.MemoryValues, ParentPID, CounterFileDescriptor);
if (!AuxMemFDOrError)
exit(ChildProcessExitCodeE::AuxiliaryMemorySetupFailed);
((void (*)(size_t, int))(intptr_t)FunctionDataCopy)(FunctionDataCopySize,
*AuxMemFDOrError);
exit(0);
}
Expected<llvm::SmallVector<int64_t, 4>>
runWithCounter(StringRef CounterName) const override {
SmallVector<int64_t, 4> Value(1, 0);
Error PossibleBenchmarkError =
createSubProcessAndRunBenchmark(CounterName, Value);
if (PossibleBenchmarkError) {
return std::move(PossibleBenchmarkError);
}
return Value;
}
const LLVMState &State;
const ExecutableFunction Function;
const BenchmarkKey &Key;
};
#endif // __linux__
} // namespace
Expected<SmallString<0>> BenchmarkRunner::assembleSnippet(
const BenchmarkCode &BC, const SnippetRepetitor &Repetitor,
unsigned MinInstructions, unsigned LoopBodySize,
bool GenerateMemoryInstructions) const {
const std::vector<MCInst> &Instructions = BC.Key.Instructions;
SmallString<0> Buffer;
raw_svector_ostream OS(Buffer);
if (Error E = assembleToStream(
State.getExegesisTarget(), State.createTargetMachine(), BC.LiveIns,
BC.Key.RegisterInitialValues,
Repetitor.Repeat(Instructions, MinInstructions, LoopBodySize,
GenerateMemoryInstructions),
OS, BC.Key, GenerateMemoryInstructions)) {
return std::move(E);
}
return Buffer;
}
Expected<BenchmarkRunner::RunnableConfiguration>
BenchmarkRunner::getRunnableConfiguration(
const BenchmarkCode &BC, unsigned NumRepetitions, unsigned LoopBodySize,
const SnippetRepetitor &Repetitor) const {
RunnableConfiguration RC;
Benchmark &InstrBenchmark = RC.InstrBenchmark;
InstrBenchmark.Mode = Mode;
InstrBenchmark.CpuName = std::string(State.getTargetMachine().getTargetCPU());
InstrBenchmark.LLVMTriple =
State.getTargetMachine().getTargetTriple().normalize();
InstrBenchmark.NumRepetitions = NumRepetitions;
InstrBenchmark.Info = BC.Info;
const std::vector<MCInst> &Instructions = BC.Key.Instructions;
bool GenerateMemoryInstructions = ExecutionMode == ExecutionModeE::SubProcess;
InstrBenchmark.Key = BC.Key;
// Assemble at least kMinInstructionsForSnippet instructions by repeating
// the snippet for debug/analysis. This is so that the user clearly
// understands that the inside instructions are repeated.
if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareSnippet) {
const int MinInstructionsForSnippet = 4 * Instructions.size();
const int LoopBodySizeForSnippet = 2 * Instructions.size();
auto Snippet =
assembleSnippet(BC, Repetitor, MinInstructionsForSnippet,
LoopBodySizeForSnippet, GenerateMemoryInstructions);
if (Error E = Snippet.takeError())
return std::move(E);
if (auto Err = getBenchmarkFunctionBytes(*Snippet,
InstrBenchmark.AssembledSnippet))
return std::move(Err);
}
// Assemble NumRepetitions instructions repetitions of the snippet for
// measurements.
if (BenchmarkPhaseSelector >
BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) {
auto Snippet = assembleSnippet(BC, Repetitor, InstrBenchmark.NumRepetitions,
LoopBodySize, GenerateMemoryInstructions);
if (Error E = Snippet.takeError())
return std::move(E);
RC.ObjectFile = getObjectFromBuffer(*Snippet);
}
return std::move(RC);
}
Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>>
BenchmarkRunner::createFunctionExecutor(
object::OwningBinary<object::ObjectFile> ObjectFile,
const BenchmarkKey &Key) const {
switch (ExecutionMode) {
case ExecutionModeE::InProcess: {
auto InProcessExecutorOrErr = InProcessFunctionExecutorImpl::create(
State, std::move(ObjectFile), Scratch.get());
if (!InProcessExecutorOrErr)
return InProcessExecutorOrErr.takeError();
return std::move(*InProcessExecutorOrErr);
}
case ExecutionModeE::SubProcess: {
#ifdef __linux__
auto SubProcessExecutorOrErr = SubProcessFunctionExecutorImpl::create(
State, std::move(ObjectFile), Key);
if (!SubProcessExecutorOrErr)
return SubProcessExecutorOrErr.takeError();
return std::move(*SubProcessExecutorOrErr);
#else
return make_error<Failure>(
"The subprocess execution mode is only supported on Linux");
#endif
}
}
llvm_unreachable("ExecutionMode is outside expected range");
}
std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration(
RunnableConfiguration &&RC,
const std::optional<StringRef> &DumpFile) const {
Benchmark &InstrBenchmark = RC.InstrBenchmark;
object::OwningBinary<object::ObjectFile> &ObjectFile = RC.ObjectFile;
if (DumpFile && BenchmarkPhaseSelector >
BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) {
auto ObjectFilePath =
writeObjectFile(ObjectFile.getBinary()->getData(), *DumpFile);
if (Error E = ObjectFilePath.takeError()) {
return {std::move(E), std::move(InstrBenchmark)};
}
outs() << "Check generated assembly with: /usr/bin/objdump -d "
<< *ObjectFilePath << "\n";
}
if (BenchmarkPhaseSelector < BenchmarkPhaseSelectorE::Measure) {
InstrBenchmark.Error = "actual measurements skipped.";
return {Error::success(), std::move(InstrBenchmark)};
}
Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>> Executor =
createFunctionExecutor(std::move(ObjectFile), RC.InstrBenchmark.Key);
if (!Executor)
return {Executor.takeError(), std::move(InstrBenchmark)};
auto NewMeasurements = runMeasurements(**Executor);
if (Error E = NewMeasurements.takeError()) {
return {std::move(E), std::move(InstrBenchmark)};
}
assert(InstrBenchmark.NumRepetitions > 0 && "invalid NumRepetitions");
for (BenchmarkMeasure &BM : *NewMeasurements) {
// Scale the measurements by instruction.
BM.PerInstructionValue /= InstrBenchmark.NumRepetitions;
// Scale the measurements by snippet.
BM.PerSnippetValue *=
static_cast<double>(InstrBenchmark.Key.Instructions.size()) /
InstrBenchmark.NumRepetitions;
}
InstrBenchmark.Measurements = std::move(*NewMeasurements);
return {Error::success(), std::move(InstrBenchmark)};
}
Expected<std::string>
BenchmarkRunner::writeObjectFile(StringRef Buffer, StringRef FileName) const {
int ResultFD = 0;
SmallString<256> ResultPath = FileName;
if (Error E = errorCodeToError(
FileName.empty() ? sys::fs::createTemporaryFile("snippet", "o",
ResultFD, ResultPath)
: sys::fs::openFileForReadWrite(
FileName, ResultFD, sys::fs::CD_CreateAlways,
sys::fs::OF_None)))
return std::move(E);
raw_fd_ostream OFS(ResultFD, true /*ShouldClose*/);
OFS.write(Buffer.data(), Buffer.size());
OFS.flush();
return std::string(ResultPath.str());
}
BenchmarkRunner::FunctionExecutor::~FunctionExecutor() {}
} // namespace exegesis
} // namespace llvm