//===-- BenchmarkRunner.cpp -------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include #include #include "Assembler.h" #include "BenchmarkRunner.h" #include "Error.h" #include "MCInstrDescView.h" #include "PerfHelper.h" #include "SubprocessMemory.h" #include "Target.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/CrashRecoveryContext.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Program.h" #include "llvm/Support/Signals.h" #ifdef __linux__ #ifdef HAVE_LIBPFM #include #endif #include #include #include #include #include #include #include #if defined(__GLIBC__) && __has_include() && defined(HAVE_BUILTIN_THREAD_POINTER) #include #if defined(RSEQ_SIG) && defined(SYS_rseq) #define GLIBC_INITS_RSEQ #endif #endif #endif namespace llvm { namespace exegesis { BenchmarkRunner::BenchmarkRunner(const LLVMState &State, Benchmark::ModeE Mode, BenchmarkPhaseSelectorE BenchmarkPhaseSelector, ExecutionModeE ExecutionMode) : State(State), Mode(Mode), BenchmarkPhaseSelector(BenchmarkPhaseSelector), ExecutionMode(ExecutionMode), Scratch(std::make_unique()) {} BenchmarkRunner::~BenchmarkRunner() = default; void BenchmarkRunner::FunctionExecutor::accumulateCounterValues( const llvm::SmallVectorImpl &NewValues, llvm::SmallVectorImpl *Result) { const size_t NumValues = std::max(NewValues.size(), Result->size()); if (NumValues > Result->size()) Result->resize(NumValues, 0); for (size_t I = 0, End = NewValues.size(); I < End; ++I) (*Result)[I] += NewValues[I]; } Expected> BenchmarkRunner::FunctionExecutor::runAndSample(const char *Counters) const { // We sum counts when there are several counters for a single ProcRes // (e.g. P23 on SandyBridge). llvm::SmallVector CounterValues; SmallVector CounterNames; StringRef(Counters).split(CounterNames, '+'); for (auto &CounterName : CounterNames) { CounterName = CounterName.trim(); Expected> ValueOrError = runWithCounter(CounterName); if (!ValueOrError) return ValueOrError.takeError(); accumulateCounterValues(ValueOrError.get(), &CounterValues); } return CounterValues; } namespace { class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { public: static Expected> create(const LLVMState &State, object::OwningBinary Obj, BenchmarkRunner::ScratchSpace *Scratch) { Expected EF = ExecutableFunction::create(State.createTargetMachine(), std::move(Obj)); if (!EF) return EF.takeError(); return std::unique_ptr( new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch)); } private: InProcessFunctionExecutorImpl(const LLVMState &State, ExecutableFunction Function, BenchmarkRunner::ScratchSpace *Scratch) : State(State), Function(std::move(Function)), Scratch(Scratch) {} static void accumulateCounterValues(const llvm::SmallVector &NewValues, llvm::SmallVector *Result) { const size_t NumValues = std::max(NewValues.size(), Result->size()); if (NumValues > Result->size()) Result->resize(NumValues, 0); for (size_t I = 0, End = NewValues.size(); I < End; ++I) (*Result)[I] += NewValues[I]; } Expected> runWithCounter(StringRef CounterName) const override { const ExegesisTarget &ET = State.getExegesisTarget(); char *const ScratchPtr = Scratch->ptr(); auto CounterOrError = ET.createCounter(CounterName, State); if (!CounterOrError) return CounterOrError.takeError(); pfm::Counter *Counter = CounterOrError.get().get(); Scratch->clear(); { auto PS = ET.withSavedState(); CrashRecoveryContext CRC; CrashRecoveryContext::Enable(); const bool Crashed = !CRC.RunSafely([this, Counter, ScratchPtr]() { Counter->start(); this->Function(ScratchPtr); Counter->stop(); }); CrashRecoveryContext::Disable(); PS.reset(); if (Crashed) { std::string Msg = "snippet crashed while running"; #ifdef LLVM_ON_UNIX // See "Exit Status for Commands": // https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xcu_chap02.html constexpr const int kSigOffset = 128; if (const char *const SigName = strsignal(CRC.RetCode - kSigOffset)) { Msg += ": "; Msg += SigName; } #endif return make_error(std::move(Msg)); } } return Counter->readOrError(Function.getFunctionBytes()); } const LLVMState &State; const ExecutableFunction Function; BenchmarkRunner::ScratchSpace *const Scratch; }; #ifdef __linux__ // The following class implements a function executor that executes the // benchmark code within a subprocess rather than within the main llvm-exegesis // process. This allows for much more control over the execution context of the // snippet, particularly with regard to memory. This class performs all the // necessary functions to create the subprocess, execute the snippet in the // subprocess, and report results/handle errors. class SubProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { public: static Expected> create(const LLVMState &State, object::OwningBinary Obj, const BenchmarkKey &Key) { Expected EF = ExecutableFunction::create(State.createTargetMachine(), std::move(Obj)); if (!EF) return EF.takeError(); return std::unique_ptr( new SubProcessFunctionExecutorImpl(State, std::move(*EF), Key)); } private: SubProcessFunctionExecutorImpl(const LLVMState &State, ExecutableFunction Function, const BenchmarkKey &Key) : State(State), Function(std::move(Function)), Key(Key) {} enum ChildProcessExitCodeE { CounterFDReadFailed = 1, RSeqDisableFailed, FunctionDataMappingFailed, AuxiliaryMemorySetupFailed }; StringRef childProcessExitCodeToString(int ExitCode) const { switch (ExitCode) { case ChildProcessExitCodeE::CounterFDReadFailed: return "Counter file descriptor read failed"; case ChildProcessExitCodeE::RSeqDisableFailed: return "Disabling restartable sequences failed"; case ChildProcessExitCodeE::FunctionDataMappingFailed: return "Failed to map memory for assembled snippet"; case ChildProcessExitCodeE::AuxiliaryMemorySetupFailed: return "Failed to setup auxiliary memory"; default: return "Child process returned with unknown exit code"; } } Error sendFileDescriptorThroughSocket(int SocketFD, int FD) const { struct msghdr Message = {}; char Buffer[CMSG_SPACE(sizeof(FD))]; memset(Buffer, 0, sizeof(Buffer)); Message.msg_control = Buffer; Message.msg_controllen = sizeof(Buffer); struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message); ControlMessage->cmsg_level = SOL_SOCKET; ControlMessage->cmsg_type = SCM_RIGHTS; ControlMessage->cmsg_len = CMSG_LEN(sizeof(FD)); memcpy(CMSG_DATA(ControlMessage), &FD, sizeof(FD)); Message.msg_controllen = CMSG_SPACE(sizeof(FD)); ssize_t BytesWritten = sendmsg(SocketFD, &Message, 0); if (BytesWritten < 0) return make_error("Failed to write FD to socket: " + Twine(strerror(errno))); return Error::success(); } Expected getFileDescriptorFromSocket(int SocketFD) const { struct msghdr Message = {}; char ControlBuffer[256]; Message.msg_control = ControlBuffer; Message.msg_controllen = sizeof(ControlBuffer); ssize_t BytesRead = recvmsg(SocketFD, &Message, 0); if (BytesRead < 0) return make_error("Failed to read FD from socket: " + Twine(strerror(errno))); struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message); int FD; if (ControlMessage->cmsg_len != CMSG_LEN(sizeof(FD))) return make_error("Failed to get correct number of bytes for " "file descriptor from socket."); memcpy(&FD, CMSG_DATA(ControlMessage), sizeof(FD)); return FD; } Error createSubProcessAndRunBenchmark( StringRef CounterName, SmallVectorImpl &CounterValues) const { int PipeFiles[2]; int PipeSuccessOrErr = socketpair(AF_UNIX, SOCK_DGRAM, 0, PipeFiles); if (PipeSuccessOrErr != 0) { return make_error( "Failed to create a pipe for interprocess communication between " "llvm-exegesis and the benchmarking subprocess: " + Twine(strerror(errno))); } SubprocessMemory SPMemory; Error MemoryInitError = SPMemory.initializeSubprocessMemory(getpid()); if (MemoryInitError) return MemoryInitError; Error AddMemDefError = SPMemory.addMemoryDefinition(Key.MemoryValues, getpid()); if (AddMemDefError) return AddMemDefError; pid_t ParentOrChildPID = fork(); if (ParentOrChildPID == -1) { return make_error("Failed to create child process: " + Twine(strerror(errno))); } if (ParentOrChildPID == 0) { // We are in the child process, close the write end of the pipe close(PipeFiles[1]); // Unregister handlers, signal handling is now handled through ptrace in // the host process llvm::sys::unregisterHandlers(); prepareAndRunBenchmark(PipeFiles[0], Key); // The child process terminates in the above function, so we should never // get to this point. llvm_unreachable("Child process didn't exit when expected."); } const ExegesisTarget &ET = State.getExegesisTarget(); auto CounterOrError = ET.createCounter(CounterName, State, ParentOrChildPID); if (!CounterOrError) return CounterOrError.takeError(); pfm::Counter *Counter = CounterOrError.get().get(); close(PipeFiles[0]); // Make sure to attach to the process (and wait for the sigstop to be // delivered and for the process to continue) before we write to the counter // file descriptor. Attaching to the process before writing to the socket // ensures that the subprocess at most has blocked on the read call. If we // attach afterwards, the subprocess might exit before we get to the attach // call due to effects like scheduler contention, introducing transient // failures. if (ptrace(PTRACE_ATTACH, ParentOrChildPID, NULL, NULL) != 0) return make_error("Failed to attach to the child process: " + Twine(strerror(errno))); if (wait(NULL) == -1) { return make_error( "Failed to wait for child process to stop after attaching: " + Twine(strerror(errno))); } if (ptrace(PTRACE_CONT, ParentOrChildPID, NULL, NULL) != 0) return make_error( "Failed to continue execution of the child process: " + Twine(strerror(errno))); int CounterFileDescriptor = Counter->getFileDescriptor(); Error SendError = sendFileDescriptorThroughSocket(PipeFiles[1], CounterFileDescriptor); if (SendError) return SendError; int ChildStatus; if (wait(&ChildStatus) == -1) { return make_error( "Waiting for the child process to complete failed: " + Twine(strerror(errno))); } if (WIFEXITED(ChildStatus)) { int ChildExitCode = WEXITSTATUS(ChildStatus); if (ChildExitCode == 0) { // The child exited succesfully, read counter values and return // success CounterValues[0] = Counter->read(); return Error::success(); } // The child exited, but not successfully return make_error( "Child benchmarking process exited with non-zero exit code: " + childProcessExitCodeToString(ChildExitCode)); } // An error was encountered running the snippet, process it siginfo_t ChildSignalInfo; if (ptrace(PTRACE_GETSIGINFO, ParentOrChildPID, NULL, &ChildSignalInfo) == -1) { return make_error("Getting signal info from the child failed: " + Twine(strerror(errno))); } return make_error( "The benchmarking subprocess sent unexpected signal: " + Twine(strsignal(ChildSignalInfo.si_signo))); } void disableCoreDumps() const { struct rlimit rlim; rlim.rlim_cur = 0; setrlimit(RLIMIT_CORE, &rlim); } [[noreturn]] void prepareAndRunBenchmark(int Pipe, const BenchmarkKey &Key) const { // Disable core dumps in the child process as otherwise everytime we // encounter an execution failure like a segmentation fault, we will create // a core dump. We report the information directly rather than require the // user inspect a core dump. disableCoreDumps(); // The following occurs within the benchmarking subprocess pid_t ParentPID = getppid(); Expected CounterFileDescriptorOrError = getFileDescriptorFromSocket(Pipe); if (!CounterFileDescriptorOrError) exit(ChildProcessExitCodeE::CounterFDReadFailed); int CounterFileDescriptor = *CounterFileDescriptorOrError; // Glibc versions greater than 2.35 automatically call rseq during // initialization. Unmapping the region that glibc sets up for this causes // segfaults in the program Unregister the rseq region so that we can safely // unmap it later #ifdef GLIBC_INITS_RSEQ long RseqDisableOutput = syscall(SYS_rseq, (intptr_t)__builtin_thread_pointer() + __rseq_offset, __rseq_size, RSEQ_FLAG_UNREGISTER, RSEQ_SIG); if (RseqDisableOutput != 0) exit(ChildProcessExitCodeE::RSeqDisableFailed); #endif // GLIBC_INITS_RSEQ size_t FunctionDataCopySize = this->Function.FunctionBytes.size(); char *FunctionDataCopy = (char *)mmap(NULL, FunctionDataCopySize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if ((intptr_t)FunctionDataCopy == -1) exit(ChildProcessExitCodeE::FunctionDataMappingFailed); memcpy(FunctionDataCopy, this->Function.FunctionBytes.data(), this->Function.FunctionBytes.size()); mprotect(FunctionDataCopy, FunctionDataCopySize, PROT_READ | PROT_EXEC); Expected AuxMemFDOrError = SubprocessMemory::setupAuxiliaryMemoryInSubprocess( Key.MemoryValues, ParentPID, CounterFileDescriptor); if (!AuxMemFDOrError) exit(ChildProcessExitCodeE::AuxiliaryMemorySetupFailed); ((void (*)(size_t, int))(intptr_t)FunctionDataCopy)(FunctionDataCopySize, *AuxMemFDOrError); exit(0); } Expected> runWithCounter(StringRef CounterName) const override { SmallVector Value(1, 0); Error PossibleBenchmarkError = createSubProcessAndRunBenchmark(CounterName, Value); if (PossibleBenchmarkError) { return std::move(PossibleBenchmarkError); } return Value; } const LLVMState &State; const ExecutableFunction Function; const BenchmarkKey &Key; }; #endif // __linux__ } // namespace Expected> BenchmarkRunner::assembleSnippet( const BenchmarkCode &BC, const SnippetRepetitor &Repetitor, unsigned MinInstructions, unsigned LoopBodySize, bool GenerateMemoryInstructions) const { const std::vector &Instructions = BC.Key.Instructions; SmallString<0> Buffer; raw_svector_ostream OS(Buffer); if (Error E = assembleToStream( State.getExegesisTarget(), State.createTargetMachine(), BC.LiveIns, BC.Key.RegisterInitialValues, Repetitor.Repeat(Instructions, MinInstructions, LoopBodySize, GenerateMemoryInstructions), OS, BC.Key, GenerateMemoryInstructions)) { return std::move(E); } return Buffer; } Expected BenchmarkRunner::getRunnableConfiguration( const BenchmarkCode &BC, unsigned NumRepetitions, unsigned LoopBodySize, const SnippetRepetitor &Repetitor) const { RunnableConfiguration RC; Benchmark &InstrBenchmark = RC.InstrBenchmark; InstrBenchmark.Mode = Mode; InstrBenchmark.CpuName = std::string(State.getTargetMachine().getTargetCPU()); InstrBenchmark.LLVMTriple = State.getTargetMachine().getTargetTriple().normalize(); InstrBenchmark.NumRepetitions = NumRepetitions; InstrBenchmark.Info = BC.Info; const std::vector &Instructions = BC.Key.Instructions; bool GenerateMemoryInstructions = ExecutionMode == ExecutionModeE::SubProcess; InstrBenchmark.Key = BC.Key; // Assemble at least kMinInstructionsForSnippet instructions by repeating // the snippet for debug/analysis. This is so that the user clearly // understands that the inside instructions are repeated. if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareSnippet) { const int MinInstructionsForSnippet = 4 * Instructions.size(); const int LoopBodySizeForSnippet = 2 * Instructions.size(); auto Snippet = assembleSnippet(BC, Repetitor, MinInstructionsForSnippet, LoopBodySizeForSnippet, GenerateMemoryInstructions); if (Error E = Snippet.takeError()) return std::move(E); if (auto Err = getBenchmarkFunctionBytes(*Snippet, InstrBenchmark.AssembledSnippet)) return std::move(Err); } // Assemble NumRepetitions instructions repetitions of the snippet for // measurements. if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) { auto Snippet = assembleSnippet(BC, Repetitor, InstrBenchmark.NumRepetitions, LoopBodySize, GenerateMemoryInstructions); if (Error E = Snippet.takeError()) return std::move(E); RC.ObjectFile = getObjectFromBuffer(*Snippet); } return std::move(RC); } Expected> BenchmarkRunner::createFunctionExecutor( object::OwningBinary ObjectFile, const BenchmarkKey &Key) const { switch (ExecutionMode) { case ExecutionModeE::InProcess: { auto InProcessExecutorOrErr = InProcessFunctionExecutorImpl::create( State, std::move(ObjectFile), Scratch.get()); if (!InProcessExecutorOrErr) return InProcessExecutorOrErr.takeError(); return std::move(*InProcessExecutorOrErr); } case ExecutionModeE::SubProcess: { #ifdef __linux__ auto SubProcessExecutorOrErr = SubProcessFunctionExecutorImpl::create( State, std::move(ObjectFile), Key); if (!SubProcessExecutorOrErr) return SubProcessExecutorOrErr.takeError(); return std::move(*SubProcessExecutorOrErr); #else return make_error( "The subprocess execution mode is only supported on Linux"); #endif } } llvm_unreachable("ExecutionMode is outside expected range"); } Expected BenchmarkRunner::runConfiguration( RunnableConfiguration &&RC, const std::optional &DumpFile) const { Benchmark &InstrBenchmark = RC.InstrBenchmark; object::OwningBinary &ObjectFile = RC.ObjectFile; if (DumpFile && BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) { auto ObjectFilePath = writeObjectFile(ObjectFile.getBinary()->getData(), *DumpFile); if (Error E = ObjectFilePath.takeError()) { InstrBenchmark.Error = toString(std::move(E)); return std::move(InstrBenchmark); } outs() << "Check generated assembly with: /usr/bin/objdump -d " << *ObjectFilePath << "\n"; } if (BenchmarkPhaseSelector < BenchmarkPhaseSelectorE::Measure) { InstrBenchmark.Error = "actual measurements skipped."; return std::move(InstrBenchmark); } Expected> Executor = createFunctionExecutor(std::move(ObjectFile), RC.InstrBenchmark.Key); if (!Executor) return Executor.takeError(); auto NewMeasurements = runMeasurements(**Executor); if (Error E = NewMeasurements.takeError()) { if (!E.isA()) return std::move(E); InstrBenchmark.Error = toString(std::move(E)); return std::move(InstrBenchmark); } assert(InstrBenchmark.NumRepetitions > 0 && "invalid NumRepetitions"); for (BenchmarkMeasure &BM : *NewMeasurements) { // Scale the measurements by instruction. BM.PerInstructionValue /= InstrBenchmark.NumRepetitions; // Scale the measurements by snippet. BM.PerSnippetValue *= static_cast(InstrBenchmark.Key.Instructions.size()) / InstrBenchmark.NumRepetitions; } InstrBenchmark.Measurements = std::move(*NewMeasurements); return std::move(InstrBenchmark); } Expected BenchmarkRunner::writeObjectFile(StringRef Buffer, StringRef FileName) const { int ResultFD = 0; SmallString<256> ResultPath = FileName; if (Error E = errorCodeToError( FileName.empty() ? sys::fs::createTemporaryFile("snippet", "o", ResultFD, ResultPath) : sys::fs::openFileForReadWrite( FileName, ResultFD, sys::fs::CD_CreateAlways, sys::fs::OF_None))) return std::move(E); raw_fd_ostream OFS(ResultFD, true /*ShouldClose*/); OFS.write(Buffer.data(), Buffer.size()); OFS.flush(); return std::string(ResultPath.str()); } BenchmarkRunner::FunctionExecutor::~FunctionExecutor() {} } // namespace exegesis } // namespace llvm