[bolt][aarch64] Change indirect call instrumentation snippet (#180229)

Indirect call instrumentation snippet uses x16 register in exit handler
to go to destination target

    __bolt_instr_ind_call_handler_func:
            msr  nzcv, x1
            ldp  x0, x1, [sp], #16
            ldr  x16, [sp], #16
            ldp  x0, x1, [sp], #16
            br   x16	<-----

This patch adds the instrumentation snippet by calling instrumentation
runtime library through indirect call instruction and adding the wrapper
to store/load target value and the register for original indirect
instruction.

Example:
            mov x16, foo

    infirectCall:
            adrp x8, Label
            add  x8, x8, #:lo12:Label
            blr x8

Before:

    Instrumented indirect call:
            stp     x0, x1, [sp, #-16]!
            mov     x0, x8
            movk    x1, #0x0, lsl #48
            movk    x1, #0x0, lsl #32
            movk    x1, #0x0, lsl #16
            movk    x1, #0x0
            stp     x0, x1, [sp, #-16]!
            adrp    x0, __bolt_instr_ind_call_handler_func
            add     x0, x0, #:lo12:__bolt_instr_ind_call_handler_func
            blr     x0

    __bolt_instr_ind_call_handler:  (exit snippet)
            msr     nzcv, x1
            ldp     x0, x1, [sp], #16
            ldr     x16, [sp], #16
            ldp     x0, x1, [sp], #16
            br      x16    <- overwrites the original value in X16

    __bolt_instr_ind_call_handler_func:  (entry snippet)
            stp     x0, x1, [sp, #-16]!
            mrs     x1, nzcv
            adrp    x0, __bolt_instr_ind_call_handler
            add     x0, x0, x0, #:lo12:__bolt_instr_ind_call_handler
            ldr     x0, [x0]
            cmp     x0, #0x0
            b.eq    __bolt_instr_ind_call_handler
            str     x30, [sp, #-16]!
            blr     x0     <--- runtime lib store/load all regs
            ldr     x30, [sp], #16
            b       __bolt_instr_ind_call_handler


_________________________________________________________________________

After:

            mov     x16, foo
    infirectCall:
            adrp    x8, Label
            add     x8, x8, #:lo12:Label
            blr     x8

    Instrumented indirect call:
            stp     x0, x30, [sp, #-16]!
            mov     x0, callsiteid
            stp    x8, x0, [sp, #-16]!
            adrp    x8, __bolt_instr_ind_call_handler_func
            add     x8, x8, #:lo12:__bolt_instr_ind_call_handler_func
            blr     x8       <--- call trampoline instr lib
            ldr     x8, [sp], #16
            ldp     x0, x30, [sp], #16
            blr     x8       <--- original indirect call instruction

    // don't touch regs besides x0, x1
    __bolt_instr_ind_call_handler:  (exit snippet)
            ret     <---- return to original function with indirect call

    __bolt_instr_ind_call_handler_func: (entry snippet)
            adrp    x0, __bolt_instr_ind_call_handler
            add     x0, x0, #:lo12:__bolt_instr_ind_call_handler
            ldr     x0, [x0]
            cmp     x0, #0x0
            b.eq    __bolt_instr_ind_call_handler
            str     x30, [sp, #-16]!
            blr     x0     <--- runtime lib store/load all regs
            ldr     x30, [sp], #16
            b       __bolt_instr_ind_call_handler
This commit is contained in:
Alexey Moksyakov 2026-02-16 10:45:08 +03:00 committed by GitHub
parent edfe43cc9e
commit 12b561a5e2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 203 additions and 83 deletions

View File

@ -545,6 +545,11 @@ public:
llvm_unreachable("not implemented");
}
virtual void createDirectBranch(MCInst &Inst, const MCSymbol *Target,
MCContext *Ctx) {
llvm_unreachable("not implemented");
}
virtual MCPhysReg getX86R11() const { llvm_unreachable("not implemented"); }
virtual unsigned getShortBranchOpcode(unsigned Opcode) const {
@ -2399,7 +2404,7 @@ public:
virtual InstructionListType
createInstrumentedIndirectCall(MCInst &&CallInst, MCSymbol *HandlerFuncAddr,
int CallSiteID, MCContext *Ctx) {
size_t CallSiteID, MCContext *Ctx) {
llvm_unreachable("not implemented");
return InstructionListType();
}

View File

@ -48,14 +48,14 @@ static cl::opt<bool> NoLSEAtomics(
namespace {
static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) {
[[maybe_unused]] static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) {
Inst.setOpcode(AArch64::MRS);
Inst.clear();
Inst.addOperand(MCOperand::createReg(RegName));
Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV));
}
static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) {
[[maybe_unused]] static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) {
Inst.setOpcode(AArch64::MSR);
Inst.clear();
Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV));
@ -2413,6 +2413,14 @@ public:
return isCompAndBranch(Inst);
}
void createDirectBranch(MCInst &Inst, const MCSymbol *Target,
MCContext *Ctx) override {
Inst.setOpcode(AArch64::B);
Inst.clear();
Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
Inst, MCSymbolRefExpr::create(Target, *Ctx), *Ctx, 0)));
}
bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
const MCSymbol *&TBB, const MCSymbol *&FBB,
MCInst *&CondBranch,
@ -2770,21 +2778,14 @@ public:
}
InstructionListType createInstrumentedIndCallHandlerExitBB() const override {
InstructionListType Insts(5);
// Code sequence for instrumented indirect call handler:
// msr nzcv, x1
// ldp x0, x1, [sp], #16
// ldr x16, [sp], #16
// ldp x0, x1, [sp], #16
// br x16
setSystemFlag(Insts[0], AArch64::X1);
createPopRegisters(Insts[1], AArch64::X0, AArch64::X1);
// Here we load address of the next function which should be called in the
// original binary to X16 register. Writing to X16 is permitted without
// needing to restore.
loadReg(Insts[2], AArch64::X16, AArch64::SP);
createPopRegisters(Insts[3], AArch64::X0, AArch64::X1);
createIndirectBranch(Insts[4], AArch64::X16, 0);
// ret
InstructionListType Insts;
Insts.emplace_back();
createReturn(Insts.back());
return Insts;
}
@ -2837,14 +2838,28 @@ public:
InstructionListType createLoadImmediate(const MCPhysReg Dest,
uint64_t Imm) const override {
InstructionListType Insts(4);
int Shift = 48;
for (int I = 0; I < 4; I++, Shift -= 16) {
Insts[I].setOpcode(AArch64::MOVKXi);
Insts[I].addOperand(MCOperand::createReg(Dest));
Insts[I].addOperand(MCOperand::createReg(Dest));
Insts[I].addOperand(MCOperand::createImm((Imm >> Shift) & 0xFFFF));
Insts[I].addOperand(MCOperand::createImm(Shift));
InstructionListType Insts;
Insts.emplace_back();
MCInst &Inst = Insts.back();
Inst.clear();
Inst.setOpcode(AArch64::MOVZXi);
Inst.addOperand(MCOperand::createReg(Dest));
Inst.addOperand(MCOperand::createImm(Imm & 0xFFFF));
Inst.addOperand(MCOperand::createImm(0));
int Shift = 16;
for (int I = 0; I < 3; I++, Shift += 16) {
const uint64_t ImmVal = (Imm >> Shift) & 0xFFFF;
if (!ImmVal)
continue;
Insts.emplace_back();
MCInst &Inst = Insts.back();
Inst.setOpcode(AArch64::MOVKXi);
Inst.addOperand(MCOperand::createReg(Dest));
Inst.addOperand(MCOperand::createReg(Dest));
Inst.addOperand(MCOperand::createImm(ImmVal));
Inst.addOperand(MCOperand::createImm(Shift));
}
return Insts;
}
@ -2858,41 +2873,48 @@ public:
InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst,
MCSymbol *HandlerFuncAddr,
int CallSiteID,
size_t CallSiteID,
MCContext *Ctx) override {
InstructionListType Insts;
// Code sequence used to enter indirect call instrumentation helper:
// stp x0, x1, [sp, #-16]! createPushRegisters
// mov target x0 convertIndirectCallToLoad -> orr x0 target xzr
// mov x1 CallSiteID createLoadImmediate ->
// movk x1, #0x0, lsl #48
// movk x1, #0x0, lsl #32
// movk x1, #0x0, lsl #16
// movk x1, #0x0
// stp x0, x1, [sp, #-16]!
// bl *HandlerFuncAddr createIndirectCall ->
// adr x0 *HandlerFuncAddr -> adrp + add
// blr x0
// snippet requires 2 registers: target address and call site id
// stp CallIDReg, x30, [sp, #-16]!
// movz/k CallIDReg, CallSiteID
// stp TAReg, CallIDReg, [sp, #-16]! ; push address and id for lib
// adr + add TAReg, *HandlerFuncAddr ; __bolt_instr_ind_call_handler_func
// blr TAReg
// ldr TAReg, [sp], #16 ; restore target address
// ldp CallIDReg, x30, [sp], #16
// blr TAReg
const MCRegister TAReg = CallInst.getOperand(0).getReg();
const MCRegister CallIDReg =
TAReg != AArch64::X0 ? AArch64::X0 : AArch64::X1;
InstructionListType Insts;
Insts.emplace_back();
createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
Insts.emplace_back(CallInst);
convertIndirectCallToLoad(Insts.back(), AArch64::X0);
InstructionListType LoadImm =
createLoadImmediate(getIntArgRegister(1), CallSiteID);
createPushRegisters(Insts.back(), CallIDReg, AArch64::LR);
InstructionListType LoadImm = createLoadImmediate(CallIDReg, CallSiteID);
Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end());
Insts.emplace_back();
createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
createPushRegisters(Insts.back(), TAReg, CallIDReg);
Insts.resize(Insts.size() + 2);
InstructionListType Addr =
materializeAddress(HandlerFuncAddr, Ctx, AArch64::X0);
InstructionListType Addr = materializeAddress(HandlerFuncAddr, Ctx, TAReg);
assert(Addr.size() == 2 && "Invalid Addr size");
std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
Insts.emplace_back();
createIndirectCallInst(Insts.back(), isTailCall(CallInst), AArch64::X0);
// Carry over metadata including tail call marker if present.
stripAnnotations(Insts.back());
moveAnnotations(std::move(CallInst), Insts.back());
Insts.emplace_back();
createIndirectCallInst(Insts.back(), false, TAReg);
Insts.emplace_back();
loadReg(Insts.back(), TAReg, getStackPointer());
Insts.emplace_back();
createPopRegisters(Insts.back(), CallIDReg, AArch64::LR);
Insts.emplace_back(CallInst);
return Insts;
}
@ -2901,12 +2923,10 @@ public:
createInstrumentedIndCallHandlerEntryBB(const MCSymbol *InstrTrampoline,
const MCSymbol *IndCallHandler,
MCContext *Ctx) override {
// Code sequence used to check whether InstrTampoline was initialized
// Code sequence used to check whether InstrTrampoline was initialized
// and call it if so, returns via IndCallHandler
// stp x0, x1, [sp, #-16]!
// mrs x1, nzcv
// adr x0, InstrTrampoline -> adrp + add
// ldr x0, [x0]
// adrp x0, InstrTrampoline
// ldr x0, [x0, #lo12:InstrTrampoline]
// subs x0, x0, #0x0
// b.eq IndCallHandler
// str x30, [sp, #-16]!
@ -2914,30 +2934,42 @@ public:
// ldr x30, [sp], #16
// b IndCallHandler
InstructionListType Insts;
// load handler address
MCInst InstAdrp;
InstAdrp.setOpcode(AArch64::ADRP);
InstAdrp.addOperand(MCOperand::createReg(getIntArgRegister(0)));
InstAdrp.addOperand(MCOperand::createImm(0));
setOperandToSymbolRef(InstAdrp, /* OpNum */ 1, InstrTrampoline,
/* Addend */ 0, Ctx, ELF::R_AARCH64_ADR_GOT_PAGE);
Insts.emplace_back(InstAdrp);
MCInst InstLoad;
InstLoad.setOpcode(AArch64::LDRXui);
InstLoad.addOperand(MCOperand::createReg(getIntArgRegister(0)));
InstLoad.addOperand(MCOperand::createReg(getIntArgRegister(0)));
InstLoad.addOperand(MCOperand::createImm(0));
setOperandToSymbolRef(InstLoad, /* OpNum */ 2, InstrTrampoline,
/* Addend */ 0, Ctx, ELF::R_AARCH64_LD64_GOT_LO12_NC);
Insts.emplace_back(InstLoad);
InstructionListType CmpJmp =
createCmpJE(getIntArgRegister(0), 0, IndCallHandler, Ctx);
Insts.insert(Insts.end(), CmpJmp.begin(), CmpJmp.end());
Insts.emplace_back();
createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
Insts.emplace_back();
getSystemFlag(Insts.back(), getIntArgRegister(1));
Insts.emplace_back();
Insts.emplace_back();
InstructionListType Addr =
materializeAddress(InstrTrampoline, Ctx, AArch64::X0);
std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
assert(Addr.size() == 2 && "Invalid Addr size");
Insts.emplace_back();
loadReg(Insts.back(), AArch64::X0, AArch64::X0);
InstructionListType cmpJmp =
createCmpJE(AArch64::X0, 0, IndCallHandler, Ctx);
Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end());
Insts.emplace_back();
storeReg(Insts.back(), AArch64::LR, AArch64::SP);
storeReg(Insts.back(), AArch64::LR, getStackPointer());
Insts.emplace_back();
Insts.back().setOpcode(AArch64::BLR);
Insts.back().addOperand(MCOperand::createReg(AArch64::X0));
Insts.back().addOperand(MCOperand::createReg(getIntArgRegister(0)));
Insts.emplace_back();
loadReg(Insts.back(), AArch64::LR, AArch64::SP);
loadReg(Insts.back(), AArch64::LR, getStackPointer());
Insts.emplace_back();
createDirectCall(Insts.back(), IndCallHandler, Ctx, /*IsTailCall*/ true);
createDirectBranch(Insts.back(), IndCallHandler, Ctx);
return Insts;
}

View File

@ -821,7 +821,7 @@ public:
InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst,
MCSymbol *HandlerFuncAddr,
int CallSiteID,
size_t CallSiteID,
MCContext *Ctx) override {
// Code sequence used to enter indirect call instrumentation helper:
// addi sp, sp, -0x10

View File

@ -3123,7 +3123,7 @@ public:
InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst,
MCSymbol *HandlerFuncAddr,
int CallSiteID,
size_t CallSiteID,
MCContext *Ctx) override {
// Check if the target address expression used in the original indirect call
// uses the stack pointer, which we are going to clobber.

View File

@ -1696,7 +1696,7 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
#if defined(__aarch64__)
// clang-format off
__asm__ __volatile__(SAVE_ALL
"ldp x0, x1, [sp, #288]\n"
"ldp x0, x1, [sp, #272]\n"
"bl instrumentIndirectCall\n"
RESTORE_ALL
"ret\n"
@ -1733,7 +1733,7 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
#if defined(__aarch64__)
// clang-format off
__asm__ __volatile__(SAVE_ALL
"ldp x0, x1, [sp, #288]\n"
"ldp x0, x1, [sp, #272]\n"
"bl instrumentIndirectCall\n"
RESTORE_ALL
"ret\n"

View File

@ -18,10 +18,12 @@
"stp x24, x25, [sp, #-16]!\n" \
"stp x26, x27, [sp, #-16]!\n" \
"stp x28, x29, [sp, #-16]!\n" \
"str x30, [sp,#-16]!\n"
"mrs x29, nzcv\n" \
"stp x29, x30, [sp, #-16]!\n"
// Mirrors SAVE_ALL
#define RESTORE_ALL \
"ldr x30, [sp], #16\n" \
"ldp x29, x30, [sp], #16\n" \
"msr nzcv, x29\n" \
"ldp x28, x29, [sp], #16\n" \
"ldp x26, x27, [sp], #16\n" \
"ldp x24, x25, [sp], #16\n" \

View File

@ -4,20 +4,96 @@ typedef int (*func_ptr)(int, int);
int add(int a, int b) { return a + b; }
int getConst(int a, int b) { return 0xaa55; }
void foo() {
// clang-format off
__asm__ __volatile("stp x29, x30 [sp, #-16]!\n"
"adrp x0, getConst\n"
"add x0, x0, :lo12:getConst\n"
"blr x0\n"
"ldp x29, x30 [sp], #16\n"
:::);
// clang-format on
return;
}
int main() {
func_ptr fun;
fun = add;
int sum = fun(10, 20); // indirect call to 'add'
printf("The sum is: %d\n", sum);
foo();
return 0;
}
/*
REQUIRES: system-linux,bolt-runtime
RUN: %clang %cflags %s -o %t.exe -Wl,-q -no-pie -fpie
RUN: llvm-objdump --disassemble-symbols=main %t.exe \
RUN: | FileCheck %s --check-prefix=CHECKINDIRECTREG
CHECKINDIRECTREG: mov w0, #0xa
CHECKINDIRECTREG-NEXT: mov w1, #0x14
CHECKINDIRECTREG-NEXT: blr x8
RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \
RUN: -o %t.instrumented
RUN: -o %t.instrumented \
RUN: | FileCheck %s --check-prefix=CHECK-LOG
CHECK-LOG-NOT: BOLT-INSTRUMENTER: Number of indirect call site descriptors: 0
RUN: llvm-objdump --disassemble-symbols=main %t.instrumented \
RUN: | FileCheck %s --check-prefix=CHECK-MAIN
RUN: llvm-objdump --disassemble-symbols=foo %t.instrumented \
RUN: | FileCheck %s --check-prefix=CHECK-FOO
RUN: llvm-objdump --disassemble-symbols=__bolt_instr_ind_call_handler \
RUN: %t.instrumented | FileCheck %s --check-prefix=CHECK-INSTR-INDIR-CALL
RUN: llvm-objdump --disassemble-symbols=__bolt_instr_ind_call_handler_func \
RUN: %t.instrumented | FileCheck %s --check-prefix=CHECK-INSTR-INDIR-CALL-FUNC
CHECK-MAIN: mov w0, #0xa
CHECK-MAIN-NEXT: mov w1, #0x14
// store current values
CHECK-MAIN-NEXT: stp x0, x30, [sp
// load callsite id
CHECK-MAIN-NEXT: mov x0, #0x0
CHECK-MAIN-NEXT: stp x8, x0, [sp
CHECK-MAIN-NEXT: adrp x8,
CHECK-MAIN-NEXT: add x8, x8
// call instrumentation library handler function
CHECK-MAIN-NEXT: blr x8
// restore registers saved before
CHECK-MAIN-NEXT: ldr x8, [sp]
CHECK-MAIN-NEXT: ldp x0, x30, [sp]
// original indirect call instruction
CHECK-MAIN-NEXT: blr x8
// instrumented pattern for blr x0
CHECK-FOO: stp x1, x30, [sp
CHECK-FOO-NEXT: mov x1,
CHECK-FOO-NEXT: stp x0, x1, [sp
CHECK-FOO-NEXT: adrp x0
CHECK-FOO-NEXT: add x0, x0
CHECK-FOO-NEXT: blr x0
CHECK-FOO-NEXT: ldr x0, [sp]
CHECK-FOO-NEXT: ldp x1, x30, [sp]
CHECK-FOO-NEXT: blr x0
CHECK-INSTR-INDIR-CALL: __bolt_instr_ind_call_handler>:
CHECK-INSTR-INDIR-CALL-NEXT: ret
CHECK-INSTR-INDIR-CALL-FUNC: __bolt_instr_ind_call_handler_func>:
CHECK-INSTR-INDIR-CALL-FUNC-NEXT: adrp x0
CHECK-INSTR-INDIR-CALL-FUNC-NEXT: ldr x0
CHECK-INSTR-INDIR-CALL-FUNC-NEXT: cmp x0, #0x0
CHECK-INSTR-INDIR-CALL-FUNC-NEXT: b.eq{{.*}}__bolt_instr_ind_call_handler
CHECK-INSTR-INDIR-CALL-FUNC-NEXT: str x30
CHECK-INSTR-INDIR-CALL-FUNC-NEXT: blr x0
CHECK-INSTR-INDIR-CALL-FUNC-NEXT: ldr x30
CHECK-INSTR-INDIR-CALL-FUNC-NEXT: b{{.*}}__bolt_instr_ind_call_handler
# Instrumented program needs to finish returning zero
RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT
@ -25,7 +101,7 @@ RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT
# Test that the instrumented data makes sense
RUN: llvm-bolt %t.exe -o %t.bolted --data %t.fdata \
RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \
RUN: --print-only=main --print-finalized | FileCheck %s
RUN: --print-only=main,foo --print-finalized | FileCheck %s
RUN: %t.bolted | FileCheck %s -check-prefix=CHECK-OUTPUT
@ -33,6 +109,11 @@ CHECK-OUTPUT: The sum is: 30
# Check that our indirect call has 1 hit recorded in the fdata file and that
# this was processed correctly by BOLT
CHECK-LABEL: Binary Function "foo"
CHECK: blr x0 # CallProfile: 1 (0 misses) :
CHECK-NEXT: { getConst: 1 (0 misses) }
CHECK-LABEL: Binary Function "main"
CHECK: blr x8 # CallProfile: 1 (0 misses) :
CHECK-NEXT: { add: 1 (0 misses) }
*/