[Hexagon][XRay] Fix sled layout and trampoline to preserve retaddr (#188784)

The Hexagon XRay sled was 5 words (20 bytes) and the patched sequence
clobbered r31 (the link register) via callr without saving it first.
When the trampoline returned, the instrumented function's own allocframe
would then save the wrong return address, causing a crash or misrouted
return.

Expand the sled to 7 words (28 bytes) and wrap the call with
allocframe(#0)/deallocframe so the caller's r31:30 are preserved across
the trampoline call.

Detailed fixes:
- HexagonAsmPrinter: emit 6 nop words after the jump (7 words total)
- xray_hexagon.cpp: patch allocframe(#0) as first word, immext+r7 (func
ID), immext+r6 (trampoline), callr r6, deallocframe; write the first
word last for atomicity
- xray_trampoline_hexagon.S: complete rewrite -- properly load and
dereference the global handler pointer, save/restore r0-r5 and r31, add
stack frame with correct 8-byte alignment, add jumpr r31 to actually
return from trampolines
- xray_interface.cpp: update Hexagon cSledLength from 20 to 28
- Update lit tests for 6-nop sled
This commit is contained in:
Brian Cain 2026-03-30 09:25:14 -05:00 committed by GitHub
parent 3c1b9f278d
commit 5e7c66013b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 138 additions and 75 deletions

View File

@ -21,11 +21,13 @@ namespace __xray {
// The machine codes for some instructions used in runtime patching.
enum PatchOpcodes : uint32_t {
PO_JUMPI_14 = 0x5800c00a, // jump #0x014 (PC + 0x014)
PO_CALLR_R6 = 0x50a6c000, // indirect call: callr r6
PO_TFR_IMM = 0x78000000, // transfer immed
// ICLASS 0x7 - S2-type A-type
PO_IMMEXT = 0x00000000, // constant extender
PO_JUMPI_1C = 0x5800c00e, // jump #0x01c (PC + 0x01c)
PO_CALLR_R6 = 0x50a6c000, // indirect call: callr r6
PO_TFR_IMM = 0x78000000, // transfer immed
// ICLASS 0x7 - S2-type A-type
PO_IMMEXT = 0x00000000, // constant extender
PO_ALLOCFRAME_0 = 0xa09dc000, // allocframe(#0)
PO_DEALLOCFRAME = 0x901ec01e, // deallocframe
};
enum PacketWordParseBits : uint32_t {
@ -92,43 +94,61 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
//
// .L_xray_sled_N:
// <xray_sled_base>:
// { jump .Ltmp0 }
// { nop
// nop
// nop
// nop }
// { jump .Ltmp0 }
// { nop } x 6
// .Ltmp0:
//
// With the following runtime patch:
//
// xray_sled_n (32-bit):
//
// <xray_sled_n>:
// { immext(#...) // upper 26-bits of func id
// r7 = ##... // lower 6-bits of func id
// immext(#...) // upper 26-bits of trampoline
// r6 = ##... } // lower 6 bits of trampoline
// { callr r6 }
// { allocframe(#0) }
// { immext(#...) // upper 26-bits of func id
// r7 = ##... // lower 6-bits of func id
// immext(#...) // upper 26-bits of trampoline
// r6 = ##... } // lower 6-bits of trampoline
// { callr r6 }
// { deallocframe }
//
// allocframe(#0) saves the caller's r31:30 (LR:FP) before the callr
// clobbers r31, and deallocframe restores them afterward. This ensures
// the instrumented function's allocframe later saves the correct return
// address.
//
// Replacement of the first 4-byte instruction should be the last and
// atomic operation, so that user code reaching the sled concurrently
// either jumps over the whole sled, or executes the whole sled when it
// is ready.
//
// When |Enable|==false, we set back the first instruction in the sled to be
// { jump .Ltmp0 }
// { jump .Ltmp0 }
uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.address());
if (Enable) {
uint32_t *CurAddress = FirstAddress + 1;
// Word 1: immext for r7 = FuncId
*CurAddress = encodeConstantExtender(FuncId);
CurAddress++;
// Word 2: r7 = ##FuncId (low 6 bits)
*CurAddress = encodeExtendedTransferImmediate(FuncId, RN_R7);
CurAddress++;
*CurAddress = encodeConstantExtender(reinterpret_cast<uint32_t>(TracingHook));
CurAddress++;
// Word 3: immext for r6 = TracingHook
*CurAddress =
encodeExtendedTransferImmediate(reinterpret_cast<uint32_t>(TracingHook), RN_R6, true);
encodeConstantExtender(reinterpret_cast<uint32_t>(TracingHook));
CurAddress++;
// Word 4: r6 = ##TracingHook (low 6 bits), packet end
*CurAddress = encodeExtendedTransferImmediate(
reinterpret_cast<uint32_t>(TracingHook), RN_R6, true);
CurAddress++;
// Word 5: callr r6
*CurAddress = uint32_t(PO_CALLR_R6);
CurAddress++;
// Word 6: deallocframe
*CurAddress = uint32_t(PO_DEALLOCFRAME);
WriteInstFlushCache(FirstAddress, uint32_t(encodeConstantExtender(FuncId)));
// Word 0 (written last, atomically): allocframe(#0) replaces jump
WriteInstFlushCache(FirstAddress, uint32_t(PO_ALLOCFRAME_0));
} else {
WriteInstFlushCache(FirstAddress, uint32_t(PatchOpcodes::PO_JUMPI_14));
WriteInstFlushCache(FirstAddress, uint32_t(PO_JUMPI_1C));
}
return true;
}

View File

@ -56,7 +56,7 @@ static const int16_t cSledLength = 64;
#elif defined(__powerpc64__)
static const int16_t cSledLength = 8;
#elif defined(__hexagon__)
static const int16_t cSledLength = 20;
static const int16_t cSledLength = 28;
#elif defined(__riscv) && (__riscv_xlen == 64)
static const int16_t cSledLength = 68;
#elif defined(__riscv) && (__riscv_xlen == 32)

View File

@ -15,50 +15,86 @@
#include "../builtins/assembly.h"
#include "../sanitizer_common/sanitizer_asm.h"
// The patched sled sets:
// r7 = function ID
// r6 = trampoline address (used by callr, then dead)
// r31 = return address back to sled (set by callr)
//
// The sled wraps the callr with allocframe(#0)/deallocframe to preserve
// the caller's original r31:30 across the trampoline call.
.macro SAVE_REGISTERS
memw(sp+#0)=r0
memw(sp+#4)=r1
memw(sp+#8)=r2
memw(sp+#12)=r3
memw(sp+#16)=r4
// Allocate 32 bytes on the stack:
// sp+#0: r0 (parameter / return value)
// sp+#4: r1 (parameter / return value)
// sp+#8: r2 (parameter)
// sp+#12: r3 (parameter)
// sp+#16: r4 (parameter)
// sp+#20: r5 (parameter)
// sp+#24: r31 (return address back to sled's deallocframe)
// sp+#28: (padding for 8-byte alignment)
{
sp = add(sp, #-32)
}
memw(sp+#0) = r0
memw(sp+#4) = r1
memw(sp+#8) = r2
memw(sp+#12) = r3
memw(sp+#16) = r4
memw(sp+#20) = r5
memw(sp+#24) = r31
.endm
.macro RESTORE_REGISTERS
r0=memw(sp+#0)
r1=memw(sp+#4)
r2=memw(sp+#8)
r3=memw(sp+#12)
r4=memw(sp+#16)
r0 = memw(sp+#0)
r1 = memw(sp+#4)
r2 = memw(sp+#8)
r3 = memw(sp+#12)
r4 = memw(sp+#16)
r5 = memw(sp+#20)
r31 = memw(sp+#24)
{
sp = add(sp, #32)
}
.endm
.macro CALL_PATCHED_FUNC entry_type
// if (xray::XRayPatchedFunctionE != NULL)
// xray::XRayPatchedFunctionE(FuncType);
r8 = #ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)
// The patched sled puts the function type
// into r6. Move it into r0 to pass it to
// the patched function.
{ r0 = r6
r1 = \entry_type
p0 = !cmp.eq(r8, #0)
if (p0) callr r8 }
// Load the address of the global handler function pointer,
// then dereference it to get the actual handler.
r8 = ##ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)
{
r8 = memw(r8+#0)
}
// Skip if handler is not registered (null).
{
p0 = cmp.eq(r8, #0)
if (p0.new) jump:nt 0f
}
// Set up arguments for the handler:
// r0 = FuncId (was placed in r7 by the patched sled)
// r1 = entry type (ENTRY=0, EXIT=1, TAIL=2)
{
r0 = r7
r1 = \entry_type
}
{
callr r8
}
0:
.endm
.text
.globl ASM_SYMBOL(__xray_FunctionEntry)
ASM_HIDDEN(__xray_FunctionEntry)
ASM_TYPE_FUNCTION(__xray_FunctionEntry)
# LLVM-MCA-BEGIN __xray_FunctionEntry
ASM_SYMBOL(__xray_FunctionEntry):
CFI_STARTPROC
SAVE_REGISTERS
CALL_PATCHED_FUNC #0 // XRayEntryType::ENTRY
.Ltmp0:
RESTORE_REGISTERS
// return
# LLVM-MCA-END
jumpr r31
ASM_SIZE(__xray_FunctionEntry)
CFI_ENDPROC
@ -66,17 +102,14 @@ ASM_SYMBOL(__xray_FunctionEntry):
.globl ASM_SYMBOL(__xray_FunctionExit)
ASM_HIDDEN(__xray_FunctionExit)
ASM_TYPE_FUNCTION(__xray_FunctionExit)
# LLVM-MCA-BEGIN __xray_FunctionExit
ASM_SYMBOL(__xray_FunctionExit):
CFI_STARTPROC
SAVE_REGISTERS
CALL_PATCHED_FUNC #1 // XRayEntryType::EXIT
.Ltmp1:
RESTORE_REGISTERS
// return
jumpr r31
# LLVM-MCA-END
ASM_SIZE(__xray_FunctionExit)
CFI_ENDPROC
@ -84,16 +117,15 @@ ASM_SYMBOL(__xray_FunctionExit):
.globl ASM_SYMBOL(__xray_FunctionTailExit)
ASM_HIDDEN(__xray_FunctionTailExit)
ASM_TYPE_FUNCTION(__xray_FunctionTailExit)
# LLVM-MCA-BEGIN __xray_FunctionTailExit
ASM_SYMBOL(__xray_FunctionTailExit):
CFI_STARTPROC
SAVE_REGISTERS
CALL_PATCHED_FUNC #2 // XRayEntryType::TAIL
.Ltmp2:
RESTORE_REGISTERS
// return
jumpr r31
# LLVM-MCA-END
ASM_SIZE(__xray_FunctionTailExit)
CFI_ENDPROC
NO_EXEC_STACK_DIRECTIVE

View File

@ -802,28 +802,35 @@ void HexagonAsmPrinter::emitAttributes() {
}
void HexagonAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) {
static const int8_t NoopsInSledCount = 4;
static const int8_t NoopsInSledCount = 6;
// We want to emit the following pattern:
//
// .L_xray_sled_N:
// <xray_sled_base>:
// { jump .Ltmp0 }
// { nop
// nop
// nop
// nop }
// { jump .Ltmp0 }
// { nop }
// { nop }
// { nop }
// { nop }
// { nop }
// { nop }
// .Ltmp0:
//
// We need the 4 nop words because at runtime, we'd be patching over the
// full 5 words with the following pattern:
// We need the 6 nop words because at runtime, we'd be patching over the
// full 7 words with the following pattern:
//
// <xray_sled_n>:
// { immext(#...) // upper 26-bits of trampoline
// r6 = ##... // lower 6-bits of trampoline
// immext(#...) // upper 26-bits of func id
// r7 = ##... } // lower 6 bits of func id
// { callr r6 }
// { allocframe(#0) }
// { immext(#...) // upper 26-bits of func id
// r7 = ##... // lower 6-bits of func id
// immext(#...) // upper 26-bits of trampoline
// r6 = ##... } // lower 6-bits of trampoline
// { callr r6 }
// { deallocframe }
//
// allocframe saves r31:30 (LR:FP) before the call, and deallocframe
// restores them after the trampoline returns, ensuring the caller's
// return address in r31 is preserved across the sled.
//
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
OutStreamer->emitLabel(CurSled);

View File

@ -4,14 +4,14 @@ define void @Foo(i32 signext %a, i32 signext %b) #0 {
; CHECK-LABEL: @Foo
; CHECK-LABEL: .Lxray_sled_0:
; CHECK: jump .Ltmp0
; CHECK-COUNT-4: nop
; CHECK-COUNT-6: nop
entry:
%cmp = icmp sgt i32 %a, %b
br i1 %cmp, label %return, label %if.end
; CHECK-LABEL: .Lxray_sled_1:
; CHECK: jump .Ltmp1
; CHECK-COUNT-4: nop
; CHECK-COUNT-6: nop
; CHECK-LABEL: .Ltmp1:
; CHECK: if (p0) jumpr:nt r31
if.end:

View File

@ -8,6 +8,8 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always"
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: .Ltmp
; CHECK-SAME: [[#l]]:
ret i32 0
@ -17,6 +19,8 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always"
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: .Ltmp
; CHECK-SAME: [[#l]]:
; CHECK: jumpr r31