diff --git a/compiler-rt/lib/xray/xray_hexagon.cpp b/compiler-rt/lib/xray/xray_hexagon.cpp index 1c07bb046903..b299f03d6daa 100644 --- a/compiler-rt/lib/xray/xray_hexagon.cpp +++ b/compiler-rt/lib/xray/xray_hexagon.cpp @@ -21,11 +21,13 @@ namespace __xray { // The machine codes for some instructions used in runtime patching. enum PatchOpcodes : uint32_t { - PO_JUMPI_14 = 0x5800c00a, // jump #0x014 (PC + 0x014) - PO_CALLR_R6 = 0x50a6c000, // indirect call: callr r6 - PO_TFR_IMM = 0x78000000, // transfer immed - // ICLASS 0x7 - S2-type A-type - PO_IMMEXT = 0x00000000, // constant extender + PO_JUMPI_1C = 0x5800c00e, // jump #0x01c (PC + 0x01c) + PO_CALLR_R6 = 0x50a6c000, // indirect call: callr r6 + PO_TFR_IMM = 0x78000000, // transfer immed + // ICLASS 0x7 - S2-type A-type + PO_IMMEXT = 0x00000000, // constant extender + PO_ALLOCFRAME_0 = 0xa09dc000, // allocframe(#0) + PO_DEALLOCFRAME = 0x901ec01e, // deallocframe }; enum PacketWordParseBits : uint32_t { @@ -92,43 +94,61 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId, // // .L_xray_sled_N: // : - // { jump .Ltmp0 } - // { nop - // nop - // nop - // nop } + // { jump .Ltmp0 } + // { nop } x 6 // .Ltmp0: - + // // With the following runtime patch: // - // xray_sled_n (32-bit): - // // : - // { immext(#...) // upper 26-bits of func id - // r7 = ##... // lower 6-bits of func id - // immext(#...) // upper 26-bits of trampoline - // r6 = ##... } // lower 6 bits of trampoline - // { callr r6 } + // { allocframe(#0) } + // { immext(#...) // upper 26-bits of func id + // r7 = ##... // lower 6-bits of func id + // immext(#...) // upper 26-bits of trampoline + // r6 = ##... } // lower 6-bits of trampoline + // { callr r6 } + // { deallocframe } + // + // allocframe(#0) saves the caller's r31:30 (LR:FP) before the callr + // clobbers r31, and deallocframe restores them afterward. This ensures + // the instrumented function's allocframe later saves the correct return + // address. + // + // Replacement of the first 4-byte instruction should be the last and + // atomic operation, so that user code reaching the sled concurrently + // either jumps over the whole sled, or executes the whole sled when it + // is ready. // // When |Enable|==false, we set back the first instruction in the sled to be - // { jump .Ltmp0 } + // { jump .Ltmp0 } uint32_t *FirstAddress = reinterpret_cast(Sled.address()); if (Enable) { uint32_t *CurAddress = FirstAddress + 1; + // Word 1: immext for r7 = FuncId + *CurAddress = encodeConstantExtender(FuncId); + CurAddress++; + // Word 2: r7 = ##FuncId (low 6 bits) *CurAddress = encodeExtendedTransferImmediate(FuncId, RN_R7); CurAddress++; - *CurAddress = encodeConstantExtender(reinterpret_cast(TracingHook)); - CurAddress++; + // Word 3: immext for r6 = TracingHook *CurAddress = - encodeExtendedTransferImmediate(reinterpret_cast(TracingHook), RN_R6, true); + encodeConstantExtender(reinterpret_cast(TracingHook)); CurAddress++; - + // Word 4: r6 = ##TracingHook (low 6 bits), packet end + *CurAddress = encodeExtendedTransferImmediate( + reinterpret_cast(TracingHook), RN_R6, true); + CurAddress++; + // Word 5: callr r6 *CurAddress = uint32_t(PO_CALLR_R6); + CurAddress++; + // Word 6: deallocframe + *CurAddress = uint32_t(PO_DEALLOCFRAME); - WriteInstFlushCache(FirstAddress, uint32_t(encodeConstantExtender(FuncId))); + // Word 0 (written last, atomically): allocframe(#0) replaces jump + WriteInstFlushCache(FirstAddress, uint32_t(PO_ALLOCFRAME_0)); } else { - WriteInstFlushCache(FirstAddress, uint32_t(PatchOpcodes::PO_JUMPI_14)); + WriteInstFlushCache(FirstAddress, uint32_t(PO_JUMPI_1C)); } return true; } diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 9bf0c56c4521..31aaf4916b78 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -56,7 +56,7 @@ static const int16_t cSledLength = 64; #elif defined(__powerpc64__) static const int16_t cSledLength = 8; #elif defined(__hexagon__) -static const int16_t cSledLength = 20; +static const int16_t cSledLength = 28; #elif defined(__riscv) && (__riscv_xlen == 64) static const int16_t cSledLength = 68; #elif defined(__riscv) && (__riscv_xlen == 32) diff --git a/compiler-rt/lib/xray/xray_trampoline_hexagon.S b/compiler-rt/lib/xray/xray_trampoline_hexagon.S index c87ec4bed1f9..a7ba7c6b889a 100644 --- a/compiler-rt/lib/xray/xray_trampoline_hexagon.S +++ b/compiler-rt/lib/xray/xray_trampoline_hexagon.S @@ -15,50 +15,86 @@ #include "../builtins/assembly.h" #include "../sanitizer_common/sanitizer_asm.h" +// The patched sled sets: +// r7 = function ID +// r6 = trampoline address (used by callr, then dead) +// r31 = return address back to sled (set by callr) +// +// The sled wraps the callr with allocframe(#0)/deallocframe to preserve +// the caller's original r31:30 across the trampoline call. + .macro SAVE_REGISTERS -memw(sp+#0)=r0 -memw(sp+#4)=r1 -memw(sp+#8)=r2 -memw(sp+#12)=r3 -memw(sp+#16)=r4 + // Allocate 32 bytes on the stack: + // sp+#0: r0 (parameter / return value) + // sp+#4: r1 (parameter / return value) + // sp+#8: r2 (parameter) + // sp+#12: r3 (parameter) + // sp+#16: r4 (parameter) + // sp+#20: r5 (parameter) + // sp+#24: r31 (return address back to sled's deallocframe) + // sp+#28: (padding for 8-byte alignment) + { + sp = add(sp, #-32) + } + memw(sp+#0) = r0 + memw(sp+#4) = r1 + memw(sp+#8) = r2 + memw(sp+#12) = r3 + memw(sp+#16) = r4 + memw(sp+#20) = r5 + memw(sp+#24) = r31 .endm + .macro RESTORE_REGISTERS -r0=memw(sp+#0) -r1=memw(sp+#4) -r2=memw(sp+#8) -r3=memw(sp+#12) -r4=memw(sp+#16) + r0 = memw(sp+#0) + r1 = memw(sp+#4) + r2 = memw(sp+#8) + r3 = memw(sp+#12) + r4 = memw(sp+#16) + r5 = memw(sp+#20) + r31 = memw(sp+#24) + { + sp = add(sp, #32) + } .endm .macro CALL_PATCHED_FUNC entry_type - // if (xray::XRayPatchedFunctionE != NULL) - // xray::XRayPatchedFunctionE(FuncType); - - r8 = #ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE) - - // The patched sled puts the function type - // into r6. Move it into r0 to pass it to - // the patched function. - { r0 = r6 - r1 = \entry_type - p0 = !cmp.eq(r8, #0) - if (p0) callr r8 } + // Load the address of the global handler function pointer, + // then dereference it to get the actual handler. + r8 = ##ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE) + { + r8 = memw(r8+#0) + } + // Skip if handler is not registered (null). + { + p0 = cmp.eq(r8, #0) + if (p0.new) jump:nt 0f + } + // Set up arguments for the handler: + // r0 = FuncId (was placed in r7 by the patched sled) + // r1 = entry type (ENTRY=0, EXIT=1, TAIL=2) + { + r0 = r7 + r1 = \entry_type + } + { + callr r8 + } +0: .endm .text .globl ASM_SYMBOL(__xray_FunctionEntry) ASM_HIDDEN(__xray_FunctionEntry) ASM_TYPE_FUNCTION(__xray_FunctionEntry) -# LLVM-MCA-BEGIN __xray_FunctionEntry ASM_SYMBOL(__xray_FunctionEntry): CFI_STARTPROC SAVE_REGISTERS CALL_PATCHED_FUNC #0 // XRayEntryType::ENTRY -.Ltmp0: + RESTORE_REGISTERS - // return -# LLVM-MCA-END + jumpr r31 ASM_SIZE(__xray_FunctionEntry) CFI_ENDPROC @@ -66,17 +102,14 @@ ASM_SYMBOL(__xray_FunctionEntry): .globl ASM_SYMBOL(__xray_FunctionExit) ASM_HIDDEN(__xray_FunctionExit) ASM_TYPE_FUNCTION(__xray_FunctionExit) -# LLVM-MCA-BEGIN __xray_FunctionExit ASM_SYMBOL(__xray_FunctionExit): CFI_STARTPROC SAVE_REGISTERS CALL_PATCHED_FUNC #1 // XRayEntryType::EXIT -.Ltmp1: + RESTORE_REGISTERS - // return jumpr r31 -# LLVM-MCA-END ASM_SIZE(__xray_FunctionExit) CFI_ENDPROC @@ -84,16 +117,15 @@ ASM_SYMBOL(__xray_FunctionExit): .globl ASM_SYMBOL(__xray_FunctionTailExit) ASM_HIDDEN(__xray_FunctionTailExit) ASM_TYPE_FUNCTION(__xray_FunctionTailExit) -# LLVM-MCA-BEGIN __xray_FunctionTailExit ASM_SYMBOL(__xray_FunctionTailExit): CFI_STARTPROC SAVE_REGISTERS CALL_PATCHED_FUNC #2 // XRayEntryType::TAIL -.Ltmp2: + RESTORE_REGISTERS - // return jumpr r31 -# LLVM-MCA-END ASM_SIZE(__xray_FunctionTailExit) CFI_ENDPROC + +NO_EXEC_STACK_DIRECTIVE diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 37a44d6033b8..a83a221b1a9b 100644 --- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -802,28 +802,35 @@ void HexagonAsmPrinter::emitAttributes() { } void HexagonAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) { - static const int8_t NoopsInSledCount = 4; + static const int8_t NoopsInSledCount = 6; // We want to emit the following pattern: // // .L_xray_sled_N: // : - // { jump .Ltmp0 } - // { nop - // nop - // nop - // nop } + // { jump .Ltmp0 } + // { nop } + // { nop } + // { nop } + // { nop } + // { nop } + // { nop } // .Ltmp0: // - // We need the 4 nop words because at runtime, we'd be patching over the - // full 5 words with the following pattern: + // We need the 6 nop words because at runtime, we'd be patching over the + // full 7 words with the following pattern: // // : - // { immext(#...) // upper 26-bits of trampoline - // r6 = ##... // lower 6-bits of trampoline - // immext(#...) // upper 26-bits of func id - // r7 = ##... } // lower 6 bits of func id - // { callr r6 } + // { allocframe(#0) } + // { immext(#...) // upper 26-bits of func id + // r7 = ##... // lower 6-bits of func id + // immext(#...) // upper 26-bits of trampoline + // r6 = ##... } // lower 6-bits of trampoline + // { callr r6 } + // { deallocframe } // + // allocframe saves r31:30 (LR:FP) before the call, and deallocframe + // restores them after the trampoline returns, ensuring the caller's + // return address in r31 is preserved across the sled. // auto CurSled = OutContext.createTempSymbol("xray_sled_", true); OutStreamer->emitLabel(CurSled); diff --git a/llvm/test/CodeGen/Hexagon/xray-pred-ret.ll b/llvm/test/CodeGen/Hexagon/xray-pred-ret.ll index 306a00fc298e..e19927271175 100644 --- a/llvm/test/CodeGen/Hexagon/xray-pred-ret.ll +++ b/llvm/test/CodeGen/Hexagon/xray-pred-ret.ll @@ -4,14 +4,14 @@ define void @Foo(i32 signext %a, i32 signext %b) #0 { ; CHECK-LABEL: @Foo ; CHECK-LABEL: .Lxray_sled_0: ; CHECK: jump .Ltmp0 -; CHECK-COUNT-4: nop +; CHECK-COUNT-6: nop entry: %cmp = icmp sgt i32 %a, %b br i1 %cmp, label %return, label %if.end ; CHECK-LABEL: .Lxray_sled_1: ; CHECK: jump .Ltmp1 -; CHECK-COUNT-4: nop +; CHECK-COUNT-6: nop ; CHECK-LABEL: .Ltmp1: ; CHECK: if (p0) jumpr:nt r31 if.end: diff --git a/llvm/test/CodeGen/Hexagon/xray.ll b/llvm/test/CodeGen/Hexagon/xray.ll index 5a600e1e0327..67368f651893 100644 --- a/llvm/test/CodeGen/Hexagon/xray.ll +++ b/llvm/test/CodeGen/Hexagon/xray.ll @@ -8,6 +8,8 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" ; CHECK: nop ; CHECK: nop ; CHECK: nop +; CHECK: nop +; CHECK: nop ; CHECK: .Ltmp ; CHECK-SAME: [[#l]]: ret i32 0 @@ -17,6 +19,8 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" ; CHECK: nop ; CHECK: nop ; CHECK: nop +; CHECK: nop +; CHECK: nop ; CHECK: .Ltmp ; CHECK-SAME: [[#l]]: ; CHECK: jumpr r31