From 5e7c66013bce8916d976355f5455e771d3bbcff4 Mon Sep 17 00:00:00 2001 From: Brian Cain Date: Mon, 30 Mar 2026 09:25:14 -0500 Subject: [PATCH] [Hexagon][XRay] Fix sled layout and trampoline to preserve retaddr (#188784) The Hexagon XRay sled was 5 words (20 bytes) and the patched sequence clobbered r31 (the link register) via callr without saving it first. When the trampoline returned, the instrumented function's own allocframe would then save the wrong return address, causing a crash or misrouted return. Expand the sled to 7 words (28 bytes) and wrap the call with allocframe(#0)/deallocframe so the caller's r31:30 are preserved across the trampoline call. Detailed fixes: - HexagonAsmPrinter: emit 6 nop words after the jump (7 words total) - xray_hexagon.cpp: patch allocframe(#0) as first word, immext+r7 (func ID), immext+r6 (trampoline), callr r6, deallocframe; write the first word last for atomicity - xray_trampoline_hexagon.S: complete rewrite -- properly load and dereference the global handler pointer, save/restore r0-r5 and r31, add stack frame with correct 8-byte alignment, add jumpr r31 to actually return from trampolines - xray_interface.cpp: update Hexagon cSledLength from 20 to 28 - Update lit tests for 6-nop sled --- compiler-rt/lib/xray/xray_hexagon.cpp | 70 +++++++----- compiler-rt/lib/xray/xray_interface.cpp | 2 +- .../lib/xray/xray_trampoline_hexagon.S | 100 ++++++++++++------ llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp | 33 +++--- llvm/test/CodeGen/Hexagon/xray-pred-ret.ll | 4 +- llvm/test/CodeGen/Hexagon/xray.ll | 4 + 6 files changed, 138 insertions(+), 75 deletions(-) diff --git a/compiler-rt/lib/xray/xray_hexagon.cpp b/compiler-rt/lib/xray/xray_hexagon.cpp index 1c07bb046903..b299f03d6daa 100644 --- a/compiler-rt/lib/xray/xray_hexagon.cpp +++ b/compiler-rt/lib/xray/xray_hexagon.cpp @@ -21,11 +21,13 @@ namespace __xray { // The machine codes for some instructions used in runtime patching. enum PatchOpcodes : uint32_t { - PO_JUMPI_14 = 0x5800c00a, // jump #0x014 (PC + 0x014) - PO_CALLR_R6 = 0x50a6c000, // indirect call: callr r6 - PO_TFR_IMM = 0x78000000, // transfer immed - // ICLASS 0x7 - S2-type A-type - PO_IMMEXT = 0x00000000, // constant extender + PO_JUMPI_1C = 0x5800c00e, // jump #0x01c (PC + 0x01c) + PO_CALLR_R6 = 0x50a6c000, // indirect call: callr r6 + PO_TFR_IMM = 0x78000000, // transfer immed + // ICLASS 0x7 - S2-type A-type + PO_IMMEXT = 0x00000000, // constant extender + PO_ALLOCFRAME_0 = 0xa09dc000, // allocframe(#0) + PO_DEALLOCFRAME = 0x901ec01e, // deallocframe }; enum PacketWordParseBits : uint32_t { @@ -92,43 +94,61 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId, // // .L_xray_sled_N: // : - // { jump .Ltmp0 } - // { nop - // nop - // nop - // nop } + // { jump .Ltmp0 } + // { nop } x 6 // .Ltmp0: - + // // With the following runtime patch: // - // xray_sled_n (32-bit): - // // : - // { immext(#...) // upper 26-bits of func id - // r7 = ##... // lower 6-bits of func id - // immext(#...) // upper 26-bits of trampoline - // r6 = ##... } // lower 6 bits of trampoline - // { callr r6 } + // { allocframe(#0) } + // { immext(#...) // upper 26-bits of func id + // r7 = ##... // lower 6-bits of func id + // immext(#...) // upper 26-bits of trampoline + // r6 = ##... } // lower 6-bits of trampoline + // { callr r6 } + // { deallocframe } + // + // allocframe(#0) saves the caller's r31:30 (LR:FP) before the callr + // clobbers r31, and deallocframe restores them afterward. This ensures + // the instrumented function's allocframe later saves the correct return + // address. + // + // Replacement of the first 4-byte instruction should be the last and + // atomic operation, so that user code reaching the sled concurrently + // either jumps over the whole sled, or executes the whole sled when it + // is ready. // // When |Enable|==false, we set back the first instruction in the sled to be - // { jump .Ltmp0 } + // { jump .Ltmp0 } uint32_t *FirstAddress = reinterpret_cast(Sled.address()); if (Enable) { uint32_t *CurAddress = FirstAddress + 1; + // Word 1: immext for r7 = FuncId + *CurAddress = encodeConstantExtender(FuncId); + CurAddress++; + // Word 2: r7 = ##FuncId (low 6 bits) *CurAddress = encodeExtendedTransferImmediate(FuncId, RN_R7); CurAddress++; - *CurAddress = encodeConstantExtender(reinterpret_cast(TracingHook)); - CurAddress++; + // Word 3: immext for r6 = TracingHook *CurAddress = - encodeExtendedTransferImmediate(reinterpret_cast(TracingHook), RN_R6, true); + encodeConstantExtender(reinterpret_cast(TracingHook)); CurAddress++; - + // Word 4: r6 = ##TracingHook (low 6 bits), packet end + *CurAddress = encodeExtendedTransferImmediate( + reinterpret_cast(TracingHook), RN_R6, true); + CurAddress++; + // Word 5: callr r6 *CurAddress = uint32_t(PO_CALLR_R6); + CurAddress++; + // Word 6: deallocframe + *CurAddress = uint32_t(PO_DEALLOCFRAME); - WriteInstFlushCache(FirstAddress, uint32_t(encodeConstantExtender(FuncId))); + // Word 0 (written last, atomically): allocframe(#0) replaces jump + WriteInstFlushCache(FirstAddress, uint32_t(PO_ALLOCFRAME_0)); } else { - WriteInstFlushCache(FirstAddress, uint32_t(PatchOpcodes::PO_JUMPI_14)); + WriteInstFlushCache(FirstAddress, uint32_t(PO_JUMPI_1C)); } return true; } diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 9bf0c56c4521..31aaf4916b78 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -56,7 +56,7 @@ static const int16_t cSledLength = 64; #elif defined(__powerpc64__) static const int16_t cSledLength = 8; #elif defined(__hexagon__) -static const int16_t cSledLength = 20; +static const int16_t cSledLength = 28; #elif defined(__riscv) && (__riscv_xlen == 64) static const int16_t cSledLength = 68; #elif defined(__riscv) && (__riscv_xlen == 32) diff --git a/compiler-rt/lib/xray/xray_trampoline_hexagon.S b/compiler-rt/lib/xray/xray_trampoline_hexagon.S index c87ec4bed1f9..a7ba7c6b889a 100644 --- a/compiler-rt/lib/xray/xray_trampoline_hexagon.S +++ b/compiler-rt/lib/xray/xray_trampoline_hexagon.S @@ -15,50 +15,86 @@ #include "../builtins/assembly.h" #include "../sanitizer_common/sanitizer_asm.h" +// The patched sled sets: +// r7 = function ID +// r6 = trampoline address (used by callr, then dead) +// r31 = return address back to sled (set by callr) +// +// The sled wraps the callr with allocframe(#0)/deallocframe to preserve +// the caller's original r31:30 across the trampoline call. + .macro SAVE_REGISTERS -memw(sp+#0)=r0 -memw(sp+#4)=r1 -memw(sp+#8)=r2 -memw(sp+#12)=r3 -memw(sp+#16)=r4 + // Allocate 32 bytes on the stack: + // sp+#0: r0 (parameter / return value) + // sp+#4: r1 (parameter / return value) + // sp+#8: r2 (parameter) + // sp+#12: r3 (parameter) + // sp+#16: r4 (parameter) + // sp+#20: r5 (parameter) + // sp+#24: r31 (return address back to sled's deallocframe) + // sp+#28: (padding for 8-byte alignment) + { + sp = add(sp, #-32) + } + memw(sp+#0) = r0 + memw(sp+#4) = r1 + memw(sp+#8) = r2 + memw(sp+#12) = r3 + memw(sp+#16) = r4 + memw(sp+#20) = r5 + memw(sp+#24) = r31 .endm + .macro RESTORE_REGISTERS -r0=memw(sp+#0) -r1=memw(sp+#4) -r2=memw(sp+#8) -r3=memw(sp+#12) -r4=memw(sp+#16) + r0 = memw(sp+#0) + r1 = memw(sp+#4) + r2 = memw(sp+#8) + r3 = memw(sp+#12) + r4 = memw(sp+#16) + r5 = memw(sp+#20) + r31 = memw(sp+#24) + { + sp = add(sp, #32) + } .endm .macro CALL_PATCHED_FUNC entry_type - // if (xray::XRayPatchedFunctionE != NULL) - // xray::XRayPatchedFunctionE(FuncType); - - r8 = #ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE) - - // The patched sled puts the function type - // into r6. Move it into r0 to pass it to - // the patched function. - { r0 = r6 - r1 = \entry_type - p0 = !cmp.eq(r8, #0) - if (p0) callr r8 } + // Load the address of the global handler function pointer, + // then dereference it to get the actual handler. + r8 = ##ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE) + { + r8 = memw(r8+#0) + } + // Skip if handler is not registered (null). + { + p0 = cmp.eq(r8, #0) + if (p0.new) jump:nt 0f + } + // Set up arguments for the handler: + // r0 = FuncId (was placed in r7 by the patched sled) + // r1 = entry type (ENTRY=0, EXIT=1, TAIL=2) + { + r0 = r7 + r1 = \entry_type + } + { + callr r8 + } +0: .endm .text .globl ASM_SYMBOL(__xray_FunctionEntry) ASM_HIDDEN(__xray_FunctionEntry) ASM_TYPE_FUNCTION(__xray_FunctionEntry) -# LLVM-MCA-BEGIN __xray_FunctionEntry ASM_SYMBOL(__xray_FunctionEntry): CFI_STARTPROC SAVE_REGISTERS CALL_PATCHED_FUNC #0 // XRayEntryType::ENTRY -.Ltmp0: + RESTORE_REGISTERS - // return -# LLVM-MCA-END + jumpr r31 ASM_SIZE(__xray_FunctionEntry) CFI_ENDPROC @@ -66,17 +102,14 @@ ASM_SYMBOL(__xray_FunctionEntry): .globl ASM_SYMBOL(__xray_FunctionExit) ASM_HIDDEN(__xray_FunctionExit) ASM_TYPE_FUNCTION(__xray_FunctionExit) -# LLVM-MCA-BEGIN __xray_FunctionExit ASM_SYMBOL(__xray_FunctionExit): CFI_STARTPROC SAVE_REGISTERS CALL_PATCHED_FUNC #1 // XRayEntryType::EXIT -.Ltmp1: + RESTORE_REGISTERS - // return jumpr r31 -# LLVM-MCA-END ASM_SIZE(__xray_FunctionExit) CFI_ENDPROC @@ -84,16 +117,15 @@ ASM_SYMBOL(__xray_FunctionExit): .globl ASM_SYMBOL(__xray_FunctionTailExit) ASM_HIDDEN(__xray_FunctionTailExit) ASM_TYPE_FUNCTION(__xray_FunctionTailExit) -# LLVM-MCA-BEGIN __xray_FunctionTailExit ASM_SYMBOL(__xray_FunctionTailExit): CFI_STARTPROC SAVE_REGISTERS CALL_PATCHED_FUNC #2 // XRayEntryType::TAIL -.Ltmp2: + RESTORE_REGISTERS - // return jumpr r31 -# LLVM-MCA-END ASM_SIZE(__xray_FunctionTailExit) CFI_ENDPROC + +NO_EXEC_STACK_DIRECTIVE diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 37a44d6033b8..a83a221b1a9b 100644 --- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -802,28 +802,35 @@ void HexagonAsmPrinter::emitAttributes() { } void HexagonAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) { - static const int8_t NoopsInSledCount = 4; + static const int8_t NoopsInSledCount = 6; // We want to emit the following pattern: // // .L_xray_sled_N: // : - // { jump .Ltmp0 } - // { nop - // nop - // nop - // nop } + // { jump .Ltmp0 } + // { nop } + // { nop } + // { nop } + // { nop } + // { nop } + // { nop } // .Ltmp0: // - // We need the 4 nop words because at runtime, we'd be patching over the - // full 5 words with the following pattern: + // We need the 6 nop words because at runtime, we'd be patching over the + // full 7 words with the following pattern: // // : - // { immext(#...) // upper 26-bits of trampoline - // r6 = ##... // lower 6-bits of trampoline - // immext(#...) // upper 26-bits of func id - // r7 = ##... } // lower 6 bits of func id - // { callr r6 } + // { allocframe(#0) } + // { immext(#...) // upper 26-bits of func id + // r7 = ##... // lower 6-bits of func id + // immext(#...) // upper 26-bits of trampoline + // r6 = ##... } // lower 6-bits of trampoline + // { callr r6 } + // { deallocframe } // + // allocframe saves r31:30 (LR:FP) before the call, and deallocframe + // restores them after the trampoline returns, ensuring the caller's + // return address in r31 is preserved across the sled. // auto CurSled = OutContext.createTempSymbol("xray_sled_", true); OutStreamer->emitLabel(CurSled); diff --git a/llvm/test/CodeGen/Hexagon/xray-pred-ret.ll b/llvm/test/CodeGen/Hexagon/xray-pred-ret.ll index 306a00fc298e..e19927271175 100644 --- a/llvm/test/CodeGen/Hexagon/xray-pred-ret.ll +++ b/llvm/test/CodeGen/Hexagon/xray-pred-ret.ll @@ -4,14 +4,14 @@ define void @Foo(i32 signext %a, i32 signext %b) #0 { ; CHECK-LABEL: @Foo ; CHECK-LABEL: .Lxray_sled_0: ; CHECK: jump .Ltmp0 -; CHECK-COUNT-4: nop +; CHECK-COUNT-6: nop entry: %cmp = icmp sgt i32 %a, %b br i1 %cmp, label %return, label %if.end ; CHECK-LABEL: .Lxray_sled_1: ; CHECK: jump .Ltmp1 -; CHECK-COUNT-4: nop +; CHECK-COUNT-6: nop ; CHECK-LABEL: .Ltmp1: ; CHECK: if (p0) jumpr:nt r31 if.end: diff --git a/llvm/test/CodeGen/Hexagon/xray.ll b/llvm/test/CodeGen/Hexagon/xray.ll index 5a600e1e0327..67368f651893 100644 --- a/llvm/test/CodeGen/Hexagon/xray.ll +++ b/llvm/test/CodeGen/Hexagon/xray.ll @@ -8,6 +8,8 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" ; CHECK: nop ; CHECK: nop ; CHECK: nop +; CHECK: nop +; CHECK: nop ; CHECK: .Ltmp ; CHECK-SAME: [[#l]]: ret i32 0 @@ -17,6 +19,8 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" ; CHECK: nop ; CHECK: nop ; CHECK: nop +; CHECK: nop +; CHECK: nop ; CHECK: .Ltmp ; CHECK-SAME: [[#l]]: ; CHECK: jumpr r31