On Darwin, `sys/mman.h` hides `MAP_JIT` and `MAP_ANON(YMOUS)` when `_POSIX_C_SOURCE` is defined unless `_DARWIN_C_SOURCE` is also defined. `trampoline.cpp` uses those flags, so this change defines `_DARWIN_C_SOURCE` before including `<sys/mman.h>` in this file. Fixes build failure reported in #183108. Co-authored-by: Sairudra More <moresair@pe31.hpc.amslabs.hpecorp.net>
468 lines
17 KiB
C++
468 lines
17 KiB
C++
//===-- lib/runtime/trampoline.cpp -------------------------------*- C++-*-===//
|
||
//
|
||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||
// See https://llvm.org/LICENSE.txt for license information.
|
||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
//
|
||
// W^X-compliant trampoline pool implementation.
|
||
//
|
||
// This file implements a runtime trampoline pool that maintains separate
|
||
// memory regions for executable code (RX) and writable data (RW).
|
||
//
|
||
// On Linux the code region transitions RW → RX (never simultaneously W+X).
|
||
// On macOS Apple Silicon the code region uses MAP_JIT with per-thread W^X
|
||
// toggling via pthread_jit_write_protect_np, so the mapping permissions
|
||
// include both W and X but hardware enforces that only one is active at
|
||
// a time on any given thread.
|
||
//
|
||
// Architecture:
|
||
// - Code region (RX): Contains pre-assembled trampoline stubs that load
|
||
// callee address and static chain from a paired TDATA entry, then jump
|
||
// to the callee with the static chain in the appropriate register.
|
||
// - Data region (RW): Contains TrampolineData entries with {callee_address,
|
||
// static_chain_address} pairs, one per trampoline slot.
|
||
// - Free list: Tracks available trampoline slots for O(1) alloc/free.
|
||
//
|
||
// Thread safety: Uses Fortran::runtime::Lock (pthreads on POSIX,
|
||
// CRITICAL_SECTION on Windows) — not std::mutex — to avoid C++ runtime
|
||
// library dependence. A single global lock serializes pool operations.
|
||
// This is a deliberate V1 design choice to keep the initial W^X
|
||
// architectural change minimal. Per-thread lock-free pools are deferred
|
||
// to a future optimization patch.
|
||
//
|
||
// AddressSanitizer note: The trampoline code region is allocated via
|
||
// mmap (not malloc/new), so ASan does not track it. The data region
|
||
// and handles are allocated via malloc (through AllocateMemoryOrCrash),
|
||
// which ASan intercepts normally. No special annotations are needed.
|
||
//
|
||
// See flang/docs/InternalProcedureTrampolines.md for design details.
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
|
||
#include "flang/Runtime/trampoline.h"
|
||
#include "flang-rt/runtime/lock.h"
|
||
#include "flang-rt/runtime/memory.h"
|
||
#include "flang-rt/runtime/terminator.h"
|
||
#include "flang-rt/runtime/trampoline.h"
|
||
#include "flang/Runtime/freestanding-tools.h"
|
||
|
||
#include <atomic>
|
||
#include <cassert>
|
||
#include <cstdint>
|
||
#include <cstdlib>
|
||
#include <cstring>
|
||
|
||
// Platform-specific headers for memory mapping.
|
||
#if defined(_WIN32)
|
||
#include <windows.h>
|
||
#else
|
||
// On macOS/Darwin, the flang-rt CMake configuration sets
|
||
// -D_POSIX_C_SOURCE=200809, which hides BSD/Apple-specific mmap flags
|
||
// (MAP_ANON, MAP_JIT) from <sys/mman.h>. Define _DARWIN_C_SOURCE to
|
||
// re-expose them for MAP_JIT on Apple Silicon and MAP_ANON elsewhere.
|
||
#if defined(__APPLE__) && !defined(_DARWIN_C_SOURCE)
|
||
#define _DARWIN_C_SOURCE
|
||
#endif
|
||
#include <fcntl.h>
|
||
#include <sys/mman.h>
|
||
#include <unistd.h>
|
||
// Some platforms (e.g. AIX) define MAP_ANON instead of MAP_ANONYMOUS.
|
||
#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
|
||
#define MAP_ANONYMOUS MAP_ANON
|
||
#endif
|
||
#endif
|
||
|
||
// macOS Apple Silicon requires MAP_JIT and pthread_jit_write_protect_np
|
||
// to create executable memory under the hardened runtime.
|
||
#if defined(__APPLE__) && defined(__aarch64__)
|
||
#include <libkern/OSCacheControl.h>
|
||
#include <pthread.h>
|
||
#endif
|
||
|
||
// Architecture support check. Stub generators exist only for x86-64 and
|
||
// AArch64. On other architectures the file compiles but the runtime API
|
||
// functions crash with a diagnostic if actually called, so that building
|
||
// flang-rt on e.g. RISC-V or PPC64 never fails.
|
||
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || \
|
||
defined(_M_ARM64)
|
||
#define TRAMPOLINE_ARCH_SUPPORTED 1
|
||
#else
|
||
#define TRAMPOLINE_ARCH_SUPPORTED 0
|
||
#endif
|
||
|
||
namespace Fortran::runtime::trampoline {
|
||
|
||
/// Token handed back to users of the pool. It records where a slot's stub
/// lives and which slot it is, so the slot can be located and later released.
struct TrampolineHandle {
  /// Address of this slot's stub inside the pool's code region.
  void *codePtr = nullptr;
  /// Slot number within the pool; used for free-list bookkeeping.
  std::size_t slotIndex = 0;
};
|
||
|
||
// Namespace-scope globals following Flang runtime conventions:
|
||
// - Lock is trivially constructible (pthread_mutex_t / CRITICAL_SECTION)
|
||
// - Pool pointer uses std::atomic for safe double-checked locking
|
||
class TrampolinePool; // Forward declaration for pointer below.
|
||
static Lock poolLock;
|
||
static std::atomic<TrampolinePool *> poolInstance{nullptr};
|
||
|
||
/// The global trampoline pool.
|
||
class TrampolinePool {
|
||
public:
|
||
TrampolinePool() = default;
|
||
|
||
/// Return the process-wide pool, creating it on first use.
///
/// The published pointer is checked once without the lock (acquire load)
/// and, if still null, re-checked under poolLock before the pool is
/// allocated and published with a release store.
static TrampolinePool &instance() {
  if (TrampolinePool *published{
          poolInstance.load(std::memory_order_acquire)}) {
    return *published;
  }

  CriticalSection guard{poolLock};
  TrampolinePool *pool{poolInstance.load(std::memory_order_relaxed)};
  if (!pool) {
    // Nobody created the pool while we waited for the lock: allocate it
    // with SizedNew and hand ownership to the global pointer.
    Terminator terminator{__FILE__, __LINE__};
    pool =
        SizedNew<TrampolinePool>{terminator}(sizeof(TrampolinePool)).release();
    poolInstance.store(pool, std::memory_order_release);
  }
  return *pool;
}
|
||
|
||
/// Allocate a trampoline slot and initialize it.
|
||
TrampolineHandle *allocate(
|
||
const void *calleeAddress, const void *staticChainAddress) {
|
||
CriticalSection critical{lock_};
|
||
ensureInitialized();
|
||
|
||
if (freeHead_ == kInvalidIndex) {
|
||
// Pool exhausted — fixed size by design for V1.
|
||
// The pool capacity is controlled by FLANG_TRAMPOLINE_POOL_SIZE
|
||
// (default 1024). Dynamic slab growth can be added in a follow-up
|
||
// patch if real workloads demonstrate a need for it.
|
||
Terminator terminator{__FILE__, __LINE__};
|
||
terminator.Crash("Trampoline pool exhausted (max %zu slots). "
|
||
"Set FLANG_TRAMPOLINE_POOL_SIZE to increase.",
|
||
poolSize_);
|
||
}
|
||
|
||
std::size_t index{freeHead_};
|
||
freeHead_ = freeList_[index];
|
||
|
||
// Initialize the data entry.
|
||
dataRegion_[index].calleeAddress = calleeAddress;
|
||
dataRegion_[index].staticChainAddress = staticChainAddress;
|
||
|
||
// Create handle using SizedNew (malloc + placement new).
|
||
Terminator terminator{__FILE__, __LINE__};
|
||
auto owning{New<TrampolineHandle>{terminator}()};
|
||
TrampolineHandle *handle{owning.release()};
|
||
handle->codePtr =
|
||
static_cast<char *>(codeRegion_) + index * kTrampolineStubSize;
|
||
handle->slotIndex = index;
|
||
|
||
return handle;
|
||
}
|
||
|
||
/// Get the callable address of a trampoline.
|
||
void *getCallableAddress(TrampolineHandle *handle) { return handle->codePtr; }
|
||
|
||
/// Free a trampoline slot.
|
||
void free(TrampolineHandle *handle) {
|
||
CriticalSection critical{lock_};
|
||
|
||
std::size_t index{handle->slotIndex};
|
||
|
||
// Poison the data entry so that any dangling call through a freed
|
||
// trampoline traps immediately. Setting to NULL means the stub will
|
||
// jump to address 0, which is unmapped on all supported platforms
|
||
// and produces SIGSEGV/SIGBUS immediately.
|
||
dataRegion_[index].calleeAddress = nullptr;
|
||
dataRegion_[index].staticChainAddress = nullptr;
|
||
|
||
// Return slot to free list.
|
||
freeList_[index] = freeHead_;
|
||
freeHead_ = index;
|
||
|
||
FreeMemory(handle);
|
||
}
|
||
|
||
private:
|
||
static constexpr std::size_t kInvalidIndex{~std::size_t{0}};
|
||
|
||
void ensureInitialized() {
|
||
if (initialized_) {
|
||
return;
|
||
}
|
||
initialized_ = true;
|
||
|
||
// Check environment variable for pool size override.
|
||
// Fixed-size pool by design (V1): avoids complexity of dynamic growth
|
||
// and re-protection of code pages. The default (1024 slots) is
|
||
// sufficient for typical Fortran programs. Users can override via:
|
||
// export FLANG_TRAMPOLINE_POOL_SIZE=4096
|
||
if (const char *envSize = std::getenv("FLANG_TRAMPOLINE_POOL_SIZE")) {
|
||
long val{std::strtol(envSize, nullptr, 10)};
|
||
if (val > 0) {
|
||
poolSize_ = {static_cast<std::size_t>(val)};
|
||
}
|
||
}
|
||
|
||
// Allocate the data region (RW).
|
||
Terminator terminator{__FILE__, __LINE__};
|
||
dataRegion_ = static_cast<TrampolineData *>(
|
||
AllocateMemoryOrCrash(terminator, poolSize_ * sizeof(TrampolineData)));
|
||
runtime::memset(dataRegion_, 0, poolSize_ * sizeof(TrampolineData));
|
||
|
||
// Allocate the code region (initially RW for writing stubs, then RX).
|
||
std::size_t codeSize{poolSize_ * kTrampolineStubSize};
|
||
#if defined(_WIN32)
|
||
codeRegion_ = VirtualAlloc(
|
||
nullptr, codeSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
|
||
#elif defined(__APPLE__) && defined(__aarch64__)
|
||
// macOS Apple Silicon: MAP_JIT is required for pages that will become
|
||
// executable. Use pthread_jit_write_protect_np to toggle W↔X.
|
||
codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE | PROT_EXEC,
|
||
MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT, -1, 0);
|
||
if (codeRegion_ == MAP_FAILED) {
|
||
codeRegion_ = nullptr;
|
||
}
|
||
if (codeRegion_) {
|
||
// Enable writing on this thread (MAP_JIT defaults to execute).
|
||
// Guard for deployment targets older than macOS 11.0 (Apple Silicon
|
||
// always runs >= 11.0, so this is effectively unconditional at runtime).
|
||
if (__builtin_available(macOS 11.0, *)) {
|
||
pthread_jit_write_protect_np(0); // 0 = writable
|
||
}
|
||
}
|
||
#elif defined(MAP_ANONYMOUS)
|
||
// Linux and other POSIX platforms with MAP_ANONYMOUS.
|
||
codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE,
|
||
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||
if (codeRegion_ == MAP_FAILED) {
|
||
codeRegion_ = nullptr;
|
||
}
|
||
#else
|
||
// Platforms without MAP_ANONYMOUS or MAP_ANON (e.g. AIX): map /dev/zero
|
||
// as a portable anonymous-mapping equivalent (per POSIX).
|
||
{
|
||
int devZero{open("/dev/zero", O_RDONLY)};
|
||
if (devZero >= 0) {
|
||
codeRegion_ = mmap(
|
||
nullptr, codeSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, devZero, 0);
|
||
if (codeRegion_ == MAP_FAILED) {
|
||
codeRegion_ = nullptr;
|
||
}
|
||
close(devZero);
|
||
}
|
||
}
|
||
#endif
|
||
if (!codeRegion_) {
|
||
terminator.Crash("Failed to allocate trampoline code region");
|
||
}
|
||
|
||
// Generate trampoline stubs.
|
||
generateStubs();
|
||
|
||
// Flush instruction cache. Required on architectures with non-coherent
|
||
// I-cache/D-cache (AArch64, PPC, etc.). On x86-64 this is a no-op
|
||
// but harmless. Without this, AArch64 may execute stale instructions.
|
||
#if defined(__APPLE__) && defined(__aarch64__)
|
||
// On macOS, use sys_icache_invalidate (from libkern/OSCacheControl.h).
|
||
sys_icache_invalidate(codeRegion_, codeSize);
|
||
#elif defined(_WIN32)
|
||
FlushInstructionCache(GetCurrentProcess(), codeRegion_, codeSize);
|
||
#else
|
||
__builtin___clear_cache(static_cast<char *>(codeRegion_),
|
||
static_cast<char *>(codeRegion_) + codeSize);
|
||
#endif
|
||
|
||
// Make code region executable and non-writable (W^X).
|
||
#if defined(_WIN32)
|
||
DWORD oldProtect;
|
||
VirtualProtect(codeRegion_, codeSize, PAGE_EXECUTE_READ, &oldProtect);
|
||
#elif defined(__APPLE__) && defined(__aarch64__)
|
||
// Switch back to execute-only (MAP_JIT manages per-thread W^X).
|
||
if (__builtin_available(macOS 11.0, *)) {
|
||
pthread_jit_write_protect_np(1); // 1 = executable
|
||
}
|
||
#else
|
||
mprotect(codeRegion_, codeSize, PROT_READ | PROT_EXEC);
|
||
#endif
|
||
|
||
// Initialize free list.
|
||
freeList_ = static_cast<std::size_t *>(
|
||
AllocateMemoryOrCrash(terminator, poolSize_ * sizeof(std::size_t)));
|
||
|
||
for (std::size_t i{0}; i < poolSize_ - 1; ++i) {
|
||
freeList_[i] = i + 1;
|
||
}
|
||
freeList_[poolSize_ - 1] = kInvalidIndex;
|
||
freeHead_ = 0;
|
||
}
|
||
|
||
/// Generate platform-specific trampoline stubs in the code region.
|
||
/// Each stub loads callee address and static chain from its paired
|
||
/// TDATA entry and jumps to the callee.
|
||
void generateStubs() {
|
||
#if defined(__x86_64__) || defined(_M_X64)
|
||
generateStubsX86_64();
|
||
#elif defined(__aarch64__) || defined(_M_ARM64)
|
||
generateStubsAArch64();
|
||
#else
|
||
// Unsupported architecture — should never be reached because the
|
||
// extern "C" API functions guard with TRAMPOLINE_ARCH_SUPPORTED.
|
||
// Fill with trap bytes as a safety net.
|
||
runtime::memset(codeRegion_, 0, poolSize_ * kTrampolineStubSize);
|
||
#endif
|
||
}
|
||
|
||
#if defined(__x86_64__) || defined(_M_X64)
|
||
/// Generate x86-64 trampoline stubs.
|
||
///
|
||
/// Each stub does:
|
||
/// movabsq $dataEntry, %r11 ; load TDATA entry address
|
||
/// movq 8(%r11), %r10 ; load static chain -> nest register
|
||
/// jmpq *(%r11) ; jump to callee address
|
||
///
|
||
/// Total: 10 + 4 + 3 = 17 bytes, padded to kTrampolineStubSize.
|
||
void generateStubsX86_64() {
|
||
auto *code{static_cast<uint8_t *>(codeRegion_)};
|
||
|
||
for (std::size_t i{0}; i < poolSize_; ++i) {
|
||
uint8_t *stub{code + i * kTrampolineStubSize};
|
||
|
||
// Address of the corresponding TDATA entry.
|
||
auto dataAddr{reinterpret_cast<uint64_t>(&dataRegion_[i])};
|
||
|
||
std::size_t off{0};
|
||
|
||
// movabsq $dataAddr, %r11 (REX.W + B, opcode 0xBB for r11)
|
||
stub[off++] = 0x49; // REX.WB
|
||
stub[off++] = 0xBB; // MOV r11, imm64
|
||
runtime::memcpy(&stub[off], &dataAddr, 8);
|
||
off += 8;
|
||
|
||
// movq 8(%r11), %r10 (load staticChainAddress into r10)
|
||
stub[off++] = 0x4D; // REX.WRB
|
||
stub[off++] = 0x8B; // MOV r/m64 -> r64
|
||
stub[off++] = 0x53; // ModRM: [r11 + disp8], r10
|
||
stub[off++] = 0x08; // disp8 = 8
|
||
|
||
// jmpq *(%r11) (jump to calleeAddress)
|
||
stub[off++] = 0x41; // REX.B
|
||
stub[off++] = 0xFF; // JMP r/m64
|
||
stub[off++] = 0x23; // ModRM: [r11], opcode extension 4
|
||
|
||
// Pad the rest with INT3 (0xCC) for safety.
|
||
while (off < kTrampolineStubSize) {
|
||
stub[off++] = 0xCC;
|
||
}
|
||
}
|
||
}
|
||
#endif
|
||
|
||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||
/// Generate AArch64 trampoline stubs.
|
||
///
|
||
/// Each stub does:
|
||
/// ldr x17, .Ldata_addr ; load TDATA entry address
|
||
/// ldr x15, [x17, #8] ; load static chain -> x15 (nest reg)
|
||
/// ldr x17, [x17] ; load callee address
|
||
/// br x17 ; jump to callee
|
||
/// .Ldata_addr:
|
||
/// .quad <address of dataRegion_[i]>
|
||
///
|
||
/// Total: 4*4 + 8 = 24 bytes, padded to kTrampolineStubSize.
|
||
void generateStubsAArch64() {
|
||
auto *code{static_cast<uint8_t *>(codeRegion_)};
|
||
|
||
for (std::size_t i{0}; i < poolSize_; ++i) {
|
||
auto *stub{reinterpret_cast<uint32_t *>(code + i * kTrampolineStubSize)};
|
||
|
||
// Address of the corresponding TDATA entry.
|
||
auto dataAddr{reinterpret_cast<uint64_t>(&dataRegion_[i])};
|
||
|
||
// ldr x17, .Ldata_addr (PC-relative load, offset = 4 instructions = 16
|
||
// bytes) LDR (literal): opc=01, V=0, imm19=(16/4)=4, Rt=17
|
||
stub[0] = 0x58000091; // ldr x17, #16 (imm19=4, shifted left 2 = 16)
|
||
// Encoding: 0101 1000 0000 0000 0000 0000 1001 0001
|
||
|
||
// ldr x15, [x17, #8] (load static chain into x15, the nest register)
|
||
// LDR (unsigned offset): size=11, V=0, opc=01, imm12=1(×8), Rn=17, Rt=15
|
||
stub[1] = 0xF940062F; // ldr x15, [x17, #8]
|
||
|
||
// ldr x17, [x17] (load callee address)
|
||
// LDR (unsigned offset): size=11, V=0, opc=01, imm12=0, Rn=17, Rt=17
|
||
stub[2] = 0xF9400231; // ldr x17, [x17, #0]
|
||
|
||
// br x17
|
||
stub[3] = 0xD61F0220; // br x17
|
||
|
||
// .Ldata_addr: .quad dataRegion_[i]
|
||
runtime::memcpy(&stub[4], &dataAddr, 8);
|
||
|
||
// Pad remaining with BRK #0 (trap) for safety.
|
||
std::size_t usedWords{4 + 2}; // 4 instructions + 1 quad (2 words)
|
||
for (std::size_t w{usedWords}; w < kTrampolineStubSize / sizeof(uint32_t);
|
||
++w) {
|
||
stub[w] = 0xD4200000; // brk #0
|
||
}
|
||
}
|
||
}
|
||
#endif
|
||
|
||
Lock lock_;
|
||
bool initialized_{false};
|
||
std::size_t poolSize_{kDefaultPoolSize};
|
||
|
||
void *codeRegion_{nullptr}; // RX after initialization
|
||
TrampolineData *dataRegion_{nullptr}; // RW always
|
||
std::size_t *freeList_{nullptr}; // Intrusive free list
|
||
std::size_t freeHead_{kInvalidIndex};
|
||
};
|
||
|
||
} // namespace Fortran::runtime::trampoline
|
||
|
||
namespace Fortran::runtime {
|
||
extern "C" {
|
||
|
||
// Helper: crash with a clear message on unsupported architectures.
|
||
// This is only reached if -fsafe-trampoline was used on a target
|
||
// that lacks stub generators. The driver should emit a warning and
|
||
// ignore the flag on unsupported architectures, but the runtime
|
||
// provides a safety net.
|
||
static inline void crashIfUnsupported() {
|
||
#if !TRAMPOLINE_ARCH_SUPPORTED
|
||
Terminator terminator{__FILE__, __LINE__};
|
||
terminator.Crash("Runtime trampolines are not supported on this "
|
||
"architecture. Recompile without -fsafe-trampoline "
|
||
"to use the legacy stack-trampoline path.");
|
||
#endif
|
||
}
|
||
|
||
// Entry point: take a slot from the global pool for the given callee and
// static chain and return its handle. The `scratch` argument is not used by
// this implementation.
void *RTDEF(TrampolineInit)(
    void *scratch, const void *calleeAddress, const void *staticChainAddress) {
  crashIfUnsupported();
  return trampoline::TrampolinePool::instance().allocate(
      calleeAddress, staticChainAddress);
}
|
||
|
||
// Entry point: translate a handle obtained from TrampolineInit into the
// address of its stub.
void *RTDEF(TrampolineAdjust)(void *handle) {
  crashIfUnsupported();
  auto *typedHandle{static_cast<trampoline::TrampolineHandle *>(handle)};
  return trampoline::TrampolinePool::instance().getCallableAddress(
      typedHandle);
}
|
||
|
||
// Entry point: give the slot behind a handle obtained from TrampolineInit
// back to the global pool and release the handle.
void RTDEF(TrampolineFree)(void *handle) {
  crashIfUnsupported();
  auto *typedHandle{static_cast<trampoline::TrampolineHandle *>(handle)};
  trampoline::TrampolinePool::instance().free(typedHandle);
}
|
||
|
||
} // extern "C"
|
||
} // namespace Fortran::runtime
|