llvm-project/flang-rt/lib/runtime/trampoline.cpp
Sairudra More b16e012603
[flang-rt] Fix macOS build: define _DARWIN_C_SOURCE for mmap flags (#186142)
On Darwin, `sys/mman.h` hides `MAP_JIT` and `MAP_ANON(YMOUS)` when
`_POSIX_C_SOURCE` is defined unless `_DARWIN_C_SOURCE` is also defined.
`trampoline.cpp` uses those flags, so this change defines
`_DARWIN_C_SOURCE` before including `<sys/mman.h>` in this file.

Fixes build failure reported in #183108.

Co-authored-by: Sairudra More <moresair@pe31.hpc.amslabs.hpecorp.net>
2026-03-25 10:23:31 +05:30

468 lines
17 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//===-- lib/runtime/trampoline.cpp -------------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// W^X-compliant trampoline pool implementation.
//
// This file implements a runtime trampoline pool that maintains separate
// memory regions for executable code (RX) and writable data (RW).
//
// On Linux the code region transitions RW → RX (never simultaneously W+X).
// On macOS Apple Silicon the code region uses MAP_JIT with per-thread W^X
// toggling via pthread_jit_write_protect_np, so the mapping permissions
// include both W and X but hardware enforces that only one is active at
// a time on any given thread.
//
// Architecture:
// - Code region (RX): Contains pre-assembled trampoline stubs that load
// callee address and static chain from a paired TDATA entry, then jump
// to the callee with the static chain in the appropriate register.
// - Data region (RW): Contains TrampolineData entries with {callee_address,
// static_chain_address} pairs, one per trampoline slot.
// - Free list: Tracks available trampoline slots for O(1) alloc/free.
//
// Thread safety: Uses Fortran::runtime::Lock (pthreads on POSIX,
// CRITICAL_SECTION on Windows) — not std::mutex — to avoid C++ runtime
// library dependence. A single global lock serializes pool operations.
// This is a deliberate V1 design choice to keep the initial W^X
// architectural change minimal. Per-thread lock-free pools are deferred
// to a future optimization patch.
//
// AddressSanitizer note: The trampoline code region is allocated via
// mmap (not malloc/new), so ASan does not track it. The data region
// and handles are allocated via malloc (through AllocateMemoryOrCrash),
// which ASan intercepts normally. No special annotations are needed.
//
// See flang/docs/InternalProcedureTrampolines.md for design details.
//
//===----------------------------------------------------------------------===//
#include "flang/Runtime/trampoline.h"
#include "flang-rt/runtime/lock.h"
#include "flang-rt/runtime/memory.h"
#include "flang-rt/runtime/terminator.h"
#include "flang-rt/runtime/trampoline.h"
#include "flang/Runtime/freestanding-tools.h"
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
// Platform-specific headers for memory mapping.
#if defined(_WIN32)
#include <windows.h>
#else
// On macOS/Darwin, the flang-rt CMake configuration sets
// -D_POSIX_C_SOURCE=200809, which hides BSD/Apple-specific mmap flags
// (MAP_ANON, MAP_JIT) from <sys/mman.h>. Define _DARWIN_C_SOURCE to
// re-expose them for MAP_JIT on Apple Silicon and MAP_ANON elsewhere.
#if defined(__APPLE__) && !defined(_DARWIN_C_SOURCE)
#define _DARWIN_C_SOURCE
#endif
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
// Some platforms (e.g. AIX) define MAP_ANON instead of MAP_ANONYMOUS.
#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif
// macOS Apple Silicon requires MAP_JIT and pthread_jit_write_protect_np
// to create executable memory under the hardened runtime.
#if defined(__APPLE__) && defined(__aarch64__)
#include <libkern/OSCacheControl.h>
#include <pthread.h>
#endif
// Architecture support check. Stub generators exist only for x86-64 and
// AArch64. On other architectures the file compiles but the runtime API
// functions crash with a diagnostic if actually called, so that building
// flang-rt on e.g. RISC-V or PPC64 never fails.
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || \
defined(_M_ARM64)
#define TRAMPOLINE_ARCH_SUPPORTED 1
#else
#define TRAMPOLINE_ARCH_SUPPORTED 0
#endif
namespace Fortran::runtime::trampoline {
/// Opaque handle handed back to callers of the trampoline runtime API.
/// It records where the slot's executable stub lives and which pool slot
/// it occupies, so the slot can later be looked up and recycled.
struct TrampolineHandle {
void *codePtr{nullptr}; // Address of this slot's stub inside the code region.
std::size_t slotIndex{0}; // Slot number in the pool; used when the slot is freed.
};
// Namespace-scope globals following Flang runtime conventions:
// - Lock is trivially constructible (pthread_mutex_t / CRITICAL_SECTION)
// - Pool pointer uses std::atomic for safe double-checked locking
// poolLock guards creation of the singleton in TrampolinePool::instance();
// poolInstance is null until that function has constructed the pool and
// published it with a release store.
class TrampolinePool; // Forward declaration for pointer below.
static Lock poolLock;
static std::atomic<TrampolinePool *> poolInstance{nullptr};
/// The global trampoline pool.
class TrampolinePool {
public:
TrampolinePool() = default;
/// Returns the process-wide pool, creating it on first use.
/// The fast path is a single acquire load; only the first caller(s) take
/// poolLock, re-check under the lock, and publish the new object.
static TrampolinePool &instance() {
  if (TrampolinePool *existing{poolInstance.load(std::memory_order_acquire)}) {
    return *existing;
  }
  CriticalSection critical{poolLock};
  // Another thread may have created the pool while we waited for the lock.
  // A relaxed load suffices here because the lock orders us after its store.
  TrampolinePool *pool{poolInstance.load(std::memory_order_relaxed)};
  if (!pool) {
    // The pool is built with the runtime allocator (SizedNew) rather than
    // operator new, and is intentionally never destroyed.
    Terminator terminator{__FILE__, __LINE__};
    pool = SizedNew<TrampolinePool>{terminator}(sizeof(TrampolinePool))
               .release();
    poolInstance.store(pool, std::memory_order_release);
  }
  return *pool;
}
/// Reserves a pool slot for the given callee / static-chain pair and
/// returns a newly allocated handle describing it.
/// Initializes the pool on first use. Crashes with a diagnostic when no
/// slot is free: the pool has a fixed capacity (FLANG_TRAMPOLINE_POOL_SIZE).
TrampolineHandle *allocate(
    const void *calleeAddress, const void *staticChainAddress) {
  CriticalSection critical{lock_};
  ensureInitialized();
  Terminator terminator{__FILE__, __LINE__};
  if (freeHead_ == kInvalidIndex) {
    // Fixed-size pool: there is no dynamic growth, so exhaustion is fatal.
    terminator.Crash("Trampoline pool exhausted (max %zu slots). "
                     "Set FLANG_TRAMPOLINE_POOL_SIZE to increase.",
        poolSize_);
  }
  // Pop the head of the free list.
  const std::size_t slot{freeHead_};
  freeHead_ = freeList_[slot];
  // Publish the values the slot's stub will load when it is called.
  TrampolineData &entry{dataRegion_[slot]};
  entry.calleeAddress = calleeAddress;
  entry.staticChainAddress = staticChainAddress;
  // The handle comes from the runtime allocator (New) and is released with
  // FreeMemory() in free().
  TrampolineHandle *handle{New<TrampolineHandle>{terminator}().release()};
  handle->codePtr =
      static_cast<char *>(codeRegion_) + slot * kTrampolineStubSize;
  handle->slotIndex = slot;
  return handle;
}
/// Returns the address to call for the trampoline described by `handle`,
/// i.e. the stub address recorded in the handle when it was allocated.
void *getCallableAddress(TrampolineHandle *handle) {
  void *const stubAddress{handle->codePtr};
  return stubAddress;
}
/// Returns a trampoline slot to the pool and releases its handle.
/// A null `handle` is ignored. A handle whose slot index does not belong
/// to an initialized pool is reported with a crash diagnostic instead of
/// being used to index the pool arrays.
void free(TrampolineHandle *handle) {
  if (!handle) {
    return; // Nothing to release.
  }
  CriticalSection critical{lock_};
  const std::size_t index{handle->slotIndex};
  if (!initialized_ || index >= poolSize_) {
    // dataRegion_ and freeList_ hold exactly poolSize_ entries (and are
    // null before initialization); indexing them with this value would
    // corrupt unrelated memory.
    Terminator terminator{__FILE__, __LINE__};
    terminator.Crash(
        "Invalid trampoline handle (slot %zu, pool size %zu)", index,
        poolSize_);
  }
  // Poison the data entry so that a dangling call through the freed
  // trampoline makes the stub jump to a null address and fault rather
  // than reach a stale callee.
  dataRegion_[index].calleeAddress = nullptr;
  dataRegion_[index].staticChainAddress = nullptr;
  // Push the slot onto the head of the free list.
  freeList_[index] = freeHead_;
  freeHead_ = index;
  FreeMemory(handle);
}
private:
// Sentinel slot index (all bits set): marks an empty free list in freeHead_
// and terminates the chain stored in freeList_.
static constexpr std::size_t kInvalidIndex{~std::size_t{0}};
/// One-time lazy setup of the pool; returns immediately on later calls.
/// Reads the optional FLANG_TRAMPOLINE_POOL_SIZE override, allocates the
/// data region, maps the code region, writes the stubs, flushes the
/// instruction cache, makes the code region executable, and builds the
/// free list. Every failure is fatal and reported through Terminator.
/// Called from allocate() with lock_ held.
void ensureInitialized() {
  if (initialized_) {
    return;
  }
  // Set up front: every failure path below terminates the program, so a
  // partially initialized pool is never observed.
  initialized_ = true;
  Terminator terminator{__FILE__, __LINE__};
  // Check environment variable for pool size override.
  // The pool has a fixed size by design (no dynamic growth or
  // re-protection of code pages). Users can override the default via:
  //   export FLANG_TRAMPOLINE_POOL_SIZE=4096
  // Non-numeric, zero, or negative values are ignored.
  if (const char *envSize = std::getenv("FLANG_TRAMPOLINE_POOL_SIZE")) {
    const long val{std::strtol(envSize, nullptr, 10)};
    if (val > 0) {
      // Reject sizes for which any of the per-slot byte counts computed
      // below (stub bytes, data entries, free-list entries) would wrap
      // around std::size_t and yield undersized allocations.
      const std::size_t bytesPerSlot{
          kTrampolineStubSize + sizeof(TrampolineData) + sizeof(std::size_t)};
      const std::size_t maxSlots{~std::size_t{0} / bytesPerSlot};
      const auto requested{static_cast<unsigned long>(val)};
      if (requested > maxSlots) {
        terminator.Crash("FLANG_TRAMPOLINE_POOL_SIZE (%ld) is too large "
                         "(max %zu slots).",
            val, maxSlots);
      }
      poolSize_ = static_cast<std::size_t>(requested);
    }
  }
  // Allocate the data region (RW), zero-filled.
  dataRegion_ = static_cast<TrampolineData *>(
      AllocateMemoryOrCrash(terminator, poolSize_ * sizeof(TrampolineData)));
  runtime::memset(dataRegion_, 0, poolSize_ * sizeof(TrampolineData));
  // Allocate the code region (initially writable so the stubs can be
  // emitted, executable afterwards).
  std::size_t codeSize{poolSize_ * kTrampolineStubSize};
#if defined(_WIN32)
  codeRegion_ = VirtualAlloc(
      nullptr, codeSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
#elif defined(__APPLE__) && defined(__aarch64__)
  // macOS Apple Silicon: MAP_JIT is required for pages that will become
  // executable. pthread_jit_write_protect_np toggles between writable and
  // executable for the current thread.
  codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE | PROT_EXEC,
      MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT, -1, 0);
  if (codeRegion_ == MAP_FAILED) {
    codeRegion_ = nullptr;
  }
  if (codeRegion_) {
    // Enable writing on this thread (MAP_JIT defaults to execute).
    // Guard for deployment targets older than macOS 11.0 (Apple Silicon
    // always runs >= 11.0, so this is effectively unconditional at runtime).
    if (__builtin_available(macOS 11.0, *)) {
      pthread_jit_write_protect_np(0); // 0 = writable
    }
  }
#elif defined(MAP_ANONYMOUS)
  // Linux and other POSIX platforms with MAP_ANONYMOUS.
  codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE,
      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (codeRegion_ == MAP_FAILED) {
    codeRegion_ = nullptr;
  }
#else
  // Platforms with neither MAP_ANONYMOUS nor MAP_ANON: map /dev/zero
  // privately as an anonymous-mapping equivalent.
  {
    int devZero{open("/dev/zero", O_RDONLY)};
    if (devZero >= 0) {
      codeRegion_ = mmap(
          nullptr, codeSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, devZero, 0);
      if (codeRegion_ == MAP_FAILED) {
        codeRegion_ = nullptr;
      }
      close(devZero);
    }
  }
#endif
  if (!codeRegion_) {
    terminator.Crash("Failed to allocate trampoline code region");
  }
  // Generate trampoline stubs.
  generateStubs();
  // Flush instruction cache. Required on architectures with non-coherent
  // I-cache/D-cache (e.g. AArch64), where stale instructions could
  // otherwise be executed; harmless elsewhere.
#if defined(__APPLE__) && defined(__aarch64__)
  // On macOS, use sys_icache_invalidate (from libkern/OSCacheControl.h).
  sys_icache_invalidate(codeRegion_, codeSize);
#elif defined(_WIN32)
  FlushInstructionCache(GetCurrentProcess(), codeRegion_, codeSize);
#else
  __builtin___clear_cache(static_cast<char *>(codeRegion_),
      static_cast<char *>(codeRegion_) + codeSize);
#endif
  // Make code region executable and non-writable (W^X). A failure here
  // must be fatal: otherwise every trampoline call would later fault on
  // a non-executable page with no diagnostic.
#if defined(_WIN32)
  DWORD oldProtect;
  if (!VirtualProtect(codeRegion_, codeSize, PAGE_EXECUTE_READ, &oldProtect)) {
    terminator.Crash("Failed to make trampoline code region executable");
  }
#elif defined(__APPLE__) && defined(__aarch64__)
  // Switch this thread back to execute mode (MAP_JIT manages per-thread
  // W^X).
  if (__builtin_available(macOS 11.0, *)) {
    pthread_jit_write_protect_np(1); // 1 = executable
  }
#else
  if (mprotect(codeRegion_, codeSize, PROT_READ | PROT_EXEC) != 0) {
    terminator.Crash("Failed to make trampoline code region executable");
  }
#endif
  // Initialize free list: slot i links to slot i + 1, the last slot ends
  // the chain, and the head starts at slot 0.
  freeList_ = static_cast<std::size_t *>(
      AllocateMemoryOrCrash(terminator, poolSize_ * sizeof(std::size_t)));
  for (std::size_t i{0}; i < poolSize_ - 1; ++i) {
    freeList_[i] = i + 1;
  }
  freeList_[poolSize_ - 1] = kInvalidIndex;
  freeHead_ = 0;
}
/// Fills the code region with one trampoline stub per pool slot, using the
/// generator selected at compile time for the target architecture.
/// Each stub loads callee address and static chain from its paired
/// TDATA entry and jumps to the callee.
void generateStubs() {
#if defined(__x86_64__) || defined(_M_X64)
generateStubsX86_64();
#elif defined(__aarch64__) || defined(_M_ARM64)
generateStubsAArch64();
#else
// Unsupported architecture: not expected to be reached, because the
// extern "C" API functions call crashIfUnsupported() first, which crashes
// when TRAMPOLINE_ARCH_SUPPORTED is 0.
// As a safety net the region is zero-filled instead of left uninitialized.
// NOTE(review): zero bytes are not verified here to decode as a trapping
// instruction on every ISA.
runtime::memset(codeRegion_, 0, poolSize_ * kTrampolineStubSize);
#endif
}
#if defined(__x86_64__) || defined(_M_X64)
/// Emits the x86-64 stub for every pool slot.
///
/// Stub layout (17 bytes of code, then INT3 filler up to
/// kTrampolineStubSize):
///   49 BB <imm64>   movabsq $dataEntry, %r11 ; address of the TDATA entry
///   4D 8B 53 08     movq 8(%r11), %r10       ; static chain -> nest register
///   41 FF 23        jmpq *(%r11)             ; jump to the callee address
void generateStubsX86_64() {
  // REX.WB prefix + opcode of "MOV r11, imm64"; the immediate follows.
  constexpr uint8_t loadEntryOpcode[]{0x49, 0xBB};
  // Fixed tail shared by all stubs.
  constexpr uint8_t loadChainAndJump[]{
      0x4D, 0x8B, 0x53, 0x08, // REX.WRB, MOV r64 <- r/m64, [r11 + 8] -> r10
      0x41, 0xFF, 0x23, // REX.B, JMP r/m64 (/4), [r11]
  };
  uint8_t *const regionBase{static_cast<uint8_t *>(codeRegion_)};
  for (std::size_t slot{0}; slot < poolSize_; ++slot) {
    uint8_t *cursor{regionBase + slot * kTrampolineStubSize};
    uint8_t *const stubEnd{cursor + kTrampolineStubSize};
    // Absolute address of the TDATA entry paired with this stub.
    const uint64_t entryAddress{
        reinterpret_cast<uint64_t>(&dataRegion_[slot])};
    runtime::memcpy(cursor, loadEntryOpcode, sizeof loadEntryOpcode);
    cursor += sizeof loadEntryOpcode;
    runtime::memcpy(cursor, &entryAddress, sizeof entryAddress);
    cursor += sizeof entryAddress;
    runtime::memcpy(cursor, loadChainAndJump, sizeof loadChainAndJump);
    cursor += sizeof loadChainAndJump;
    // Fill the unused remainder of the slot with INT3 (0xCC).
    while (cursor < stubEnd) {
      *cursor++ = 0xCC;
    }
  }
}
#endif
#if defined(__aarch64__) || defined(_M_ARM64)
/// Emits the AArch64 stub for every pool slot.
///
/// Stub layout (four instructions, an 8-byte literal, then BRK #0 filler
/// up to kTrampolineStubSize):
///   ldr x17, .Ldata_addr   ; PC-relative load of the TDATA entry address
///   ldr x15, [x17, #8]     ; static chain -> x15 (nest register)
///   ldr x17, [x17]         ; callee address
///   br  x17                ; jump to the callee
/// .Ldata_addr:
///   .quad <address of dataRegion_[i]>
void generateStubsAArch64() {
  constexpr uint32_t stubInstructions[]{
      0x58000091, // ldr x17, #16  (LDR literal: imm19 = 4, Rt = 17)
      0xF940062F, // ldr x15, [x17, #8]  (imm12 = 1, scaled by 8)
      0xF9400231, // ldr x17, [x17, #0]
      0xD61F0220, // br x17
  };
  constexpr std::size_t instructionCount{
      sizeof stubInstructions / sizeof stubInstructions[0]};
  constexpr uint32_t trapWord{0xD4200000}; // brk #0
  // Instructions plus the two words occupied by the 64-bit literal.
  constexpr std::size_t wordsInUse{instructionCount + 2};
  const std::size_t wordsPerStub{kTrampolineStubSize / sizeof(uint32_t)};
  uint8_t *const regionBase{static_cast<uint8_t *>(codeRegion_)};
  for (std::size_t slot{0}; slot < poolSize_; ++slot) {
    uint32_t *const words{
        reinterpret_cast<uint32_t *>(regionBase + slot * kTrampolineStubSize)};
    // Absolute address of the TDATA entry paired with this stub.
    const uint64_t entryAddress{
        reinterpret_cast<uint64_t>(&dataRegion_[slot])};
    for (std::size_t k{0}; k < instructionCount; ++k) {
      words[k] = stubInstructions[k];
    }
    // The literal sits right after the instructions, 16 bytes past the
    // first ldr, which is the offset encoded in that instruction.
    runtime::memcpy(&words[instructionCount], &entryAddress,
        sizeof entryAddress);
    // Fill the unused remainder of the slot with BRK #0.
    for (std::size_t w{wordsInUse}; w < wordsPerStub; ++w) {
      words[w] = trapWord;
    }
  }
}
#endif
// Held by allocate() and free(); lazy initialization runs under it too
// because allocate() calls ensureInitialized() after taking the lock.
Lock lock_;
// Set on the first ensureInitialized() call; later calls return early.
bool initialized_{false};
// Number of slots: kDefaultPoolSize unless FLANG_TRAMPOLINE_POOL_SIZE
// supplies a positive value when the pool is initialized.
std::size_t poolSize_{kDefaultPoolSize};
void *codeRegion_{nullptr}; // RX after initialization
TrampolineData *dataRegion_{nullptr}; // RW always
// Free-list links kept in a separate array (not inside the slots):
// freeList_[i] is the index of the free slot that follows slot i, or
// kInvalidIndex at the end of the chain.
std::size_t *freeList_{nullptr};
std::size_t freeHead_{kInvalidIndex}; // First free slot, or kInvalidIndex if none.
};
};
} // namespace Fortran::runtime::trampoline
namespace Fortran::runtime {
extern "C" {
// Guard called at the top of every trampoline API entry point.
// On architectures with a stub generator (TRAMPOLINE_ARCH_SUPPORTED != 0)
// it does nothing. Elsewhere it terminates the program with a message
// telling the user to rebuild without -fsafe-trampoline.
static inline void crashIfUnsupported() {
#if TRAMPOLINE_ARCH_SUPPORTED
  // Stub generators exist for this target; nothing to check.
#else
  Terminator terminator{__FILE__, __LINE__};
  terminator.Crash("Runtime trampolines are not supported on this "
                   "architecture. Recompile without -fsafe-trampoline "
                   "to use the legacy stack-trampoline path.");
#endif
}
// Runtime entry point: allocates a trampoline for `calleeAddress` with the
// given static chain and returns its handle as an opaque pointer.
// `scratch` is accepted as part of the entry point's signature but is not
// used by this pool-based implementation.
void *RTDEF(TrampolineInit)(
    void *scratch, const void *calleeAddress, const void *staticChainAddress) {
  static_cast<void>(scratch);
  crashIfUnsupported();
  return trampoline::TrampolinePool::instance().allocate(
      calleeAddress, staticChainAddress);
}
// Runtime entry point: converts a handle returned by TrampolineInit into
// the address that can actually be called.
void *RTDEF(TrampolineAdjust)(void *handle) {
  crashIfUnsupported();
  auto *typedHandle{static_cast<trampoline::TrampolineHandle *>(handle)};
  return trampoline::TrampolinePool::instance().getCallableAddress(
      typedHandle);
}
// Runtime entry point: releases the trampoline identified by a handle
// returned by TrampolineInit, making its slot available for reuse.
void RTDEF(TrampolineFree)(void *handle) {
  crashIfUnsupported();
  auto *typedHandle{static_cast<trampoline::TrampolineHandle *>(handle)};
  trampoline::TrampolinePool::instance().free(typedHandle);
}
} // extern "C"
} // namespace Fortran::runtime