From da34f9bb45243d5338e6e0b69413e9e165470053 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Fri, 16 Jan 2026 15:36:47 -0800 Subject: [PATCH] [AArch64] Fix Windows prologue handling to pair more registers. (#170214) Currently, there's code to suppress pairing, but we don't actually need to suppress that; we just need to suppress the formation of pre-decrement/post-increment instructions. Pairing saves an instruction in some cases, and enables packed unwind in some cases. --- .../Target/AArch64/AArch64FrameLowering.cpp | 25 ++++------- .../AArch64/AArch64PrologueEpilogue.cpp | 14 +++++- .../CodeGen/AArch64/arm64-windows-calls.ll | 44 +++++++++---------- llvm/test/CodeGen/AArch64/win64_vararg2.ll | 24 ++++------ llvm/test/CodeGen/AArch64/wineh-pac.ll | 32 +++++++------- .../CodeGen/AArch64/wineh-save-lrpair2.mir | 12 ++--- .../COFF/AArch64/arm64-register-variables.ll | 24 +++++----- 7 files changed, 84 insertions(+), 91 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index f130912c4c1b..284b6074abb9 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1560,7 +1560,6 @@ static bool produceCompactUnwindFrame(const AArch64FrameLowering &AFL, static bool invalidateWindowsRegisterPairing(bool SpillExtendedVolatile, unsigned SpillCount, unsigned Reg1, unsigned Reg2, bool NeedsWinCFI, - bool IsFirst, const TargetRegisterInfo *TRI) { // If we are generating register pairs for a Windows function that requires // EH support, then pair consecutive registers only. There are no unwind @@ -1586,12 +1585,9 @@ static bool invalidateWindowsRegisterPairing(bool SpillExtendedVolatile, : false; // If pairing a GPR with LR, the pair can be described by the save_lrpair - // opcode. 
If this is the first register pair, it would end up with a - // predecrement, but there's no save_lrpair_x opcode, so we can only do this - // if LR is paired with something else than the first register. - // The save_lrpair opcode requires the first register to be an odd one. + // opcode. The save_lrpair opcode requires the first register to be odd. if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 && - (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst) + (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR) return false; return true; } @@ -1604,12 +1600,10 @@ static bool invalidateRegisterPairing(bool SpillExtendedVolatile, unsigned SpillCount, unsigned Reg1, unsigned Reg2, bool UsesWinAAPCS, bool NeedsWinCFI, bool NeedsFrameRecord, - bool IsFirst, const TargetRegisterInfo *TRI) { if (UsesWinAAPCS) return invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount, - Reg1, Reg2, NeedsWinCFI, IsFirst, - TRI); + Reg1, Reg2, NeedsWinCFI, TRI); // If we need to store the frame record, don't pair any register // with LR other than FP. @@ -1779,21 +1773,20 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, // Add the next reg to the pair if it is in the same register class. if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) { MCRegister NextReg = CSI[i + RegInc].getReg(); - bool IsFirst = i == FirstReg; unsigned SpillCount = NeedsWinCFI ? 
FirstReg - i : i; switch (RPI.Type) { case RegPairInfo::GPR: if (AArch64::GPR64RegClass.contains(NextReg) && - !invalidateRegisterPairing( - SpillExtendedVolatile, SpillCount, RPI.Reg1, NextReg, IsWindows, - NeedsWinCFI, NeedsFrameRecord, IsFirst, TRI)) + !invalidateRegisterPairing(SpillExtendedVolatile, SpillCount, + RPI.Reg1, NextReg, IsWindows, + NeedsWinCFI, NeedsFrameRecord, TRI)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR64: if (AArch64::FPR64RegClass.contains(NextReg) && - !invalidateRegisterPairing( - SpillExtendedVolatile, SpillCount, RPI.Reg1, NextReg, IsWindows, - NeedsWinCFI, NeedsFrameRecord, IsFirst, TRI)) + !invalidateRegisterPairing(SpillExtendedVolatile, SpillCount, + RPI.Reg1, NextReg, IsWindows, + NeedsWinCFI, NeedsFrameRecord, TRI)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR128: diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 1008c5a3ef86..37a5f5b8b1c6 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -168,9 +168,16 @@ AArch64PrologueEpilogueCommon::convertCalleeSaveRestoreToSPPrePostIncDec( // If the first store isn't right where we want SP then we can't fold the // update in so create a normal arithmetic instruction instead. + // + // On Windows, some register pairs involving LR can't be folded because + // there isn't a corresponding unwind opcode. 
if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 || CSStackSizeInc < MinOffset * (int64_t)Scale.getFixedValue() || - CSStackSizeInc > MaxOffset * (int64_t)Scale.getFixedValue()) { + CSStackSizeInc > MaxOffset * (int64_t)Scale.getFixedValue() || + (NeedsWinCFI && + (NewOpc == AArch64::LDPXpost || NewOpc == AArch64::STPXpre) && + RegInfo.getEncodingValue(MBBI->getOperand(0).getReg()) + 1 != + RegInfo.getEncodingValue(MBBI->getOperand(1).getReg()))) { // If we are destroying the frame, make sure we add the increment after the // last frame operation. if (FrameFlag == MachineInstr::FrameDestroy) { @@ -310,6 +317,11 @@ bool AArch64PrologueEpilogueCommon::shouldCombineCSRLocalStackBump( // (to force a stp with predecrement) to match the packed unwind format, // provided that there actually are any callee saved registers to merge the // decrement with. + // + // Note that for certain paired saves, like "x19, lr", we can't actually + // emit a predecrement stp, but packed unwind still expects a separate stack + // adjustment. + // // This is potentially marginally slower, but allows using the packed // unwind format for functions that both have a local area and callee saved // registers. Using the packed unwind format notably reduces the size of diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll index 666f1cb7bcf6..41f00a6c41b3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll +++ b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll @@ -143,10 +143,10 @@ define void @call_copy_pod() { ; CHECK-LABEL: call_copy_pod: ; CHECK: .seh_proc call_copy_pod ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: str x19, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .seh_save_reg_x x19, 16 -; CHECK-NEXT: str x30, [sp, #8] // 8-byte Spill -; CHECK-NEXT: .seh_save_reg x30, 8 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: stp x19, x30, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_lrpair x19, 0 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: adrp x19, Pod ; CHECK-NEXT: add x19, x19, :lo12:Pod @@ -154,10 +154,10 @@ define void @call_copy_pod() { ; CHECK-NEXT: bl copy_pod ; CHECK-NEXT: stp d0, d1, [x19] ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Reload -; CHECK-NEXT: .seh_save_reg x30, 8 -; CHECK-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg_x x19, 16 +; CHECK-NEXT: ldp x19, x30, [sp] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_lrpair x19, 0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: .seh_endepilogue ; CHECK-NEXT: ret ; CHECK-NEXT: .seh_endfunclet @@ -175,10 +175,8 @@ define void @call_copy_notcxx14aggregate() { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .seh_stackalloc 32 -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEXT: .seh_save_reg x19, 16 -; CHECK-NEXT: str x30, [sp, #24] // 8-byte Spill -; CHECK-NEXT: .seh_save_reg x30, 24 +; CHECK-NEXT: stp x19, x30, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_lrpair x19, 16 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: adrp x19, NotCXX14Aggregate ; CHECK-NEXT: add x19, x19, :lo12:NotCXX14Aggregate @@ -188,10 +186,8 @@ define void @call_copy_notcxx14aggregate() { ; CHECK-NEXT: ldp d0, d1, [sp] ; CHECK-NEXT: stp d0, d1, [x19] ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldr x30, [sp, #24] // 8-byte Reload -; CHECK-NEXT: .seh_save_reg x30, 24 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEXT: .seh_save_reg x19, 16 +; CHECK-NEXT: ldp x19, x30, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_lrpair x19, 16 ; CHECK-NEXT: add sp, sp, 
#32 ; CHECK-NEXT: .seh_stackalloc 32 ; CHECK-NEXT: .seh_endepilogue @@ -211,10 +207,10 @@ define void @call_copy_notpod() { ; CHECK-LABEL: call_copy_notpod: ; CHECK: .seh_proc call_copy_notpod ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: str x19, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .seh_save_reg_x x19, 16 -; CHECK-NEXT: str x30, [sp, #8] // 8-byte Spill -; CHECK-NEXT: .seh_save_reg x30, 8 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: stp x19, x30, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_lrpair x19, 0 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: adrp x19, NotPod ; CHECK-NEXT: add x19, x19, :lo12:NotPod @@ -222,10 +218,10 @@ define void @call_copy_notpod() { ; CHECK-NEXT: bl copy_notpod ; CHECK-NEXT: stp x0, x1, [x19] ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Reload -; CHECK-NEXT: .seh_save_reg x30, 8 -; CHECK-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg_x x19, 16 +; CHECK-NEXT: ldp x19, x30, [sp] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_lrpair x19, 0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: .seh_endepilogue ; CHECK-NEXT: ret ; CHECK-NEXT: .seh_endfunclet diff --git a/llvm/test/CodeGen/AArch64/win64_vararg2.ll b/llvm/test/CodeGen/AArch64/win64_vararg2.ll index 548e6ac5fc0f..24e815eb65d7 100644 --- a/llvm/test/CodeGen/AArch64/win64_vararg2.ll +++ b/llvm/test/CodeGen/AArch64/win64_vararg2.ll @@ -9,10 +9,8 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) 
{ ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: sub sp, sp, #80 ; CHECK-NEXT: .seh_stackalloc 80 -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEXT: .seh_save_reg x19, 16 -; CHECK-NEXT: str x30, [sp, #24] // 8-byte Spill -; CHECK-NEXT: .seh_save_reg x30, 24 +; CHECK-NEXT: stp x19, x30, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_lrpair x19, 16 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: add x8, sp, #40 ; CHECK-NEXT: mov w19, w0 @@ -27,10 +25,8 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) { ; CHECK-NEXT: cmp w19, w0 ; CHECK-NEXT: cset w0, ls ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldr x30, [sp, #24] // 8-byte Reload -; CHECK-NEXT: .seh_save_reg x30, 24 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEXT: .seh_save_reg x19, 16 +; CHECK-NEXT: ldp x19, x30, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_lrpair x19, 16 ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: .seh_stackalloc 80 ; CHECK-NEXT: .seh_endepilogue @@ -43,10 +39,8 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) { ; GISEL-NEXT: // %bb.0: ; GISEL-NEXT: sub sp, sp, #80 ; GISEL-NEXT: .seh_stackalloc 80 -; GISEL-NEXT: str x19, [sp, #16] // 8-byte Spill -; GISEL-NEXT: .seh_save_reg x19, 16 -; GISEL-NEXT: str x30, [sp, #24] // 8-byte Spill -; GISEL-NEXT: .seh_save_reg x30, 24 +; GISEL-NEXT: stp x19, x30, [sp, #16] // 16-byte Folded Spill +; GISEL-NEXT: .seh_save_lrpair x19, 16 ; GISEL-NEXT: .seh_endprologue ; GISEL-NEXT: add x8, sp, #40 ; GISEL-NEXT: mov w19, w0 @@ -61,10 +55,8 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) 
{ ; GISEL-NEXT: cmp w19, w0 ; GISEL-NEXT: cset w0, ls ; GISEL-NEXT: .seh_startepilogue -; GISEL-NEXT: ldr x30, [sp, #24] // 8-byte Reload -; GISEL-NEXT: .seh_save_reg x30, 24 -; GISEL-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; GISEL-NEXT: .seh_save_reg x19, 16 +; GISEL-NEXT: ldp x19, x30, [sp, #16] // 16-byte Folded Reload +; GISEL-NEXT: .seh_save_lrpair x19, 16 ; GISEL-NEXT: add sp, sp, #80 ; GISEL-NEXT: .seh_stackalloc 80 ; GISEL-NEXT: .seh_endepilogue diff --git a/llvm/test/CodeGen/AArch64/wineh-pac.ll b/llvm/test/CodeGen/AArch64/wineh-pac.ll index 4a95b159ff85..71dcaa3b8265 100644 --- a/llvm/test/CodeGen/AArch64/wineh-pac.ll +++ b/llvm/test/CodeGen/AArch64/wineh-pac.ll @@ -7,19 +7,19 @@ define dso_local i32 @func(ptr %g, i32 %a) "sign-return-address"="non-leaf" "sig ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: hint #27 ; CHECK-NEXT: .seh_pac_sign_lr -; CHECK-NEXT: str x19, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .seh_save_reg_x x19, 16 -; CHECK-NEXT: str x30, [sp, #8] // 8-byte Spill -; CHECK-NEXT: .seh_save_reg x30, 8 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: stp x19, x30, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_lrpair x19, 0 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: mov w19, w1 ; CHECK-NEXT: blr x0 ; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Reload -; CHECK-NEXT: .seh_save_reg x30, 8 -; CHECK-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg_x x19, 16 +; CHECK-NEXT: ldp x19, x30, [sp] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_lrpair x19, 0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: hint #31 ; CHECK-NEXT: .seh_pac_sign_lr ; CHECK-NEXT: .seh_endepilogue @@ -40,19 +40,19 @@ define dso_local i32 @func2(ptr %g, i32 %a) "sign-return-address"="non-leaf" "si ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: pacibsp ; CHECK-NEXT: .seh_pac_sign_lr -; CHECK-NEXT: str x19, 
[sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .seh_save_reg_x x19, 16 -; CHECK-NEXT: str x30, [sp, #8] // 8-byte Spill -; CHECK-NEXT: .seh_save_reg x30, 8 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: stp x19, x30, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_lrpair x19, 0 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: mov w19, w1 ; CHECK-NEXT: blr x0 ; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Reload -; CHECK-NEXT: .seh_save_reg x30, 8 -; CHECK-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg_x x19, 16 +; CHECK-NEXT: ldp x19, x30, [sp] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_lrpair x19, 0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: autibsp ; CHECK-NEXT: .seh_pac_sign_lr ; CHECK-NEXT: .seh_endepilogue diff --git a/llvm/test/CodeGen/AArch64/wineh-save-lrpair2.mir b/llvm/test/CodeGen/AArch64/wineh-save-lrpair2.mir index a78ed0694674..a91340cb534a 100644 --- a/llvm/test/CodeGen/AArch64/wineh-save-lrpair2.mir +++ b/llvm/test/CodeGen/AArch64/wineh-save-lrpair2.mir @@ -1,13 +1,13 @@ # RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \ # RUN: -stop-after=prologepilog | FileCheck %s -# Check that lr isn't paired with a GPR if it's the first pair, as -# that can't be described as a SEH opcode if combined with predecrement. +# Check that when LR is paired with a GPR, we don't combine it into a +# predecrement that can't be described as a SEH opcode. 
-# CHECK: early-clobber $sp = frame-setup STRXpre killed $x19, $sp, -16 -# CHECK-NEXT: frame-setup SEH_SaveReg_X 19, -16 -# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 1 -# CHECK-NEXT: frame-setup SEH_SaveReg 30, 8 +# CHECK: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: frame-setup SEH_StackAlloc 16 +# CHECK-NEXT: frame-setup STPXi killed $x19, killed $lr, $sp, 0 +# CHECK-NEXT: frame-setup SEH_SaveRegP 19, 30, 0 # CHECK-NEXT: frame-setup SEH_PrologEnd --- | diff --git a/llvm/test/DebugInfo/COFF/AArch64/arm64-register-variables.ll b/llvm/test/DebugInfo/COFF/AArch64/arm64-register-variables.ll index 950594ed105d..4b000518ea5e 100644 --- a/llvm/test/DebugInfo/COFF/AArch64/arm64-register-variables.ll +++ b/llvm/test/DebugInfo/COFF/AArch64/arm64-register-variables.ll @@ -21,18 +21,18 @@ ; } -; OBJ: DefRangeRegisterRelSym { -; OBJ: Kind: S_DEFRANGE_REGISTER_REL (0x1145) -; OBJ: BaseRegister: ARM64_SP (0x51) -; OBJ: HasSpilledUDTMember: No -; OBJ: OffsetInParent: 0 -; OBJ: BasePointerOffset: 12 -; OBJ: LocalVariableAddrRange { -; OBJ: OffsetStart: .text+0x14 -; OBJ: ISectStart: 0x0 -; OBJ: Range: 0x30 -; OBJ: } -; OBJ: } +; OBJ: DefRangeRegisterRelSym { +; OBJ-NEXT: Kind: S_DEFRANGE_REGISTER_REL (0x1145) +; OBJ-NEXT: BaseRegister: ARM64_SP (0x51) +; OBJ-NEXT: HasSpilledUDTMember: No +; OBJ-NEXT: OffsetInParent: 0 +; OBJ-NEXT: BasePointerOffset: 12 +; OBJ-NEXT: LocalVariableAddrRange { +; OBJ-NEXT: OffsetStart: .text+0x10 +; OBJ-NEXT: ISectStart: 0x0 +; OBJ-NEXT: Range: 0x2C +; OBJ-NEXT: } +; OBJ-NEXT: } ; ModuleID = 't.cpp' source_filename = "test/DebugInfo/COFF/register-variables-arm64.ll"