llvm-project/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
Benjamin Maxwell eb764040bc
[AArch64][SME] Implement the SME ABI (ZA state management) in Machine IR (#149062)
## Short Summary

This patch adds a new pass `aarch64-machine-sme-abi` to handle the ABI
for ZA state (e.g., lazy saves and agnostic ZA functions). It is not
currently enabled by default (but the aim is for it to be enabled by
default in LLVM 22). The goal is for this new pass to place ZA
saves/restores more optimally and to work with exception handling.

## Long Description

This patch reimplements management of ZA state for functions with
private and shared ZA state. Agnostic ZA functions will be handled in a
later patch. For now, this is under the flag `-aarch64-new-sme-abi`;
however, we intend for it to replace the current SelectionDAG
implementation once complete.

The approach taken here is to mark instructions as needing ZA to be in
a specific state ("ACTIVE" or "LOCAL_SAVED"). Machine instructions that
implicitly define or use ZA registers (such as $zt0 or $zab0) require
the "ACTIVE" state. Function calls may need the "LOCAL_SAVED" or
"ACTIVE" state depending on whether the callee has shared or private
ZA.
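
As a rough illustration (not the pass's actual code), the
per-instruction requirement could be computed as below; `ZAState`,
`isZARegister`, and `calleeSharesZA` are hypothetical names for this
sketch:

```cpp
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Hypothetical names for illustration; the real pass's types differ.
enum class ZAState { Any, Active, LocalSaved };

bool isZARegister(Register R);                 // hypothetical: $zab0, $zt0, ...
bool calleeSharesZA(const MachineInstr &Call); // hypothetical

ZAState getRequiredZAState(const MachineInstr &MI) {
  // Implicit uses/defs of ZA registers require ZA to be live ("ACTIVE").
  for (const MachineOperand &MO : MI.operands())
    if (MO.isReg() && isZARegister(MO.getReg()))
      return ZAState::Active;
  // Calls require "ACTIVE" for shared-ZA callees, and "LOCAL_SAVED"
  // (a committed lazy save) for private-ZA callees.
  if (MI.isCall())
    return calleeSharesZA(MI) ? ZAState::Active : ZAState::LocalSaved;
  return ZAState::Any; // no constraint on ZA
}
```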

We already add ZA register uses/definitions to machine instructions, so
no extra work is needed to mark these.

Calls are marked by gluing an AArch64ISD::INOUT_ZA_USE or
AArch64ISD::REQUIRES_ZA_SAVE node to the CALLSEQ_START.
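
In call lowering this amounts to something like the sketch below; the
surrounding variables (`CLI`, `NumBytes`, `CalleeSharesZA`) and the
operand order are illustrative, and only the node names come from this
patch:

```cpp
// Sketch: glue a ZA marker node into the call sequence. Illustrative,
// not the exact AArch64 lowering code.
const SDLoc &DL = CLI.DL;
SDValue Chain = CLI.Chain;
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
unsigned MarkerOpc = CalleeSharesZA ? AArch64ISD::INOUT_ZA_USE
                                    : AArch64ISD::REQUIRES_ZA_SAVE;
SDValue Marker =
    DAG.getNode(MarkerOpc, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain);
Chain = Marker.getValue(0);          // updated chain
SDValue InGlue = Marker.getValue(1); // glue fed to the call node
```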

These markers are then used by the MachineSMEABIPass to find
instructions where there is a transition between the required ZA
states. These are the points where we need to insert code to set up or
restore a ZA save (or initialize ZA).
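
Within a block, finding those points is a single forward walk. A
sketch, reusing the hypothetical `getRequiredZAState` from above
(`recordTransition` and `EntryState` are likewise hypothetical):

```cpp
// Sketch: record the in-block points where the ZA state must change.
ZAState Current = EntryState; // state on entry to the block
for (MachineInstr &MI : MBB) {
  ZAState Needed = getRequiredZAState(MI);
  if (Needed != ZAState::Any && Needed != Current) {
    // Insert the save-setup/restore (or ZA initialization) before MI.
    recordTransition(MBB, MI, Current, Needed);
    Current = Needed;
  }
}
```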

To handle control flow between blocks (which may have different ZA
state requirements), we bundle the incoming and outgoing edges of
blocks. Bundles are formed by assigning each block an incoming and an
outgoing bundle (initially, each block has its own two bundles).
Bundles are then combined by joining the outgoing bundle of each block
with the incoming bundles of all its successors.
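
A minimal, self-contained union-find sketch of that bundling (LLVM
already has an `EdgeBundles` analysis, built on `llvm::IntEqClasses`,
for this kind of merging; the numbering scheme here is illustrative):

```cpp
#include <numeric>
#include <vector>

// Block b owns incoming bundle 2*b and outgoing bundle 2*b+1; bundles
// are merged along CFG edges.
struct EdgeBundleSets {
  std::vector<int> Parent;
  explicit EdgeBundleSets(int NumBlocks) : Parent(2 * NumBlocks) {
    std::iota(Parent.begin(), Parent.end(), 0); // every bundle starts alone
  }
  int find(int X) { return Parent[X] == X ? X : Parent[X] = find(Parent[X]); }
  void join(int A, int B) { Parent[find(A)] = find(B); }
  int incoming(int Block) { return find(2 * Block); }
  int outgoing(int Block) { return find(2 * Block + 1); }
};

// For every CFG edge Pred -> Succ:
//   Bundles.join(Bundles.outgoing(Pred), Bundles.incoming(Succ));
```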

These bundles are then assigned a ZA state based on the blocks that
participate in the bundle. Blocks whose incoming edges are in a bundle
"vote" for the ZA state required at their first instruction, and
likewise, blocks whose outgoing edges are in a bundle vote for the ZA
state required at their last instruction. The state with the most votes
is assigned to the bundle, which aims to minimize the number of state
transitions.
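
The vote counting itself is then straightforward; a sketch reusing the
hypothetical `ZAState` from the first snippet:

```cpp
#include <map>

// Votes for one bundle: how many bundled edges want each state.
using VoteMap = std::map<ZAState, unsigned>;

ZAState pickBundleState(const VoteMap &Votes) {
  ZAState Best = ZAState::Active; // fallback for a bundle with no votes
  unsigned BestCount = 0;
  for (const auto &[State, Count] : Votes)
    if (Count > BestCount) {
      Best = State;
      BestCount = Count;
    }
  return Best;
}

// Collection, per block B (with Votes mapping bundle -> VoteMap):
//   Votes[incoming(B)][state required at B's first instruction]++;
//   Votes[outgoing(B)][state required at B's last instruction]++;
```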

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING
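; This function makes no calls, so no TPIDR2 lazy-save buffer is needed.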
define i32 @no_tpidr2_save_required() "aarch64_inout_za" {
; CHECK-COMMON-LABEL: no_tpidr2_save_required:
; CHECK-COMMON: // %bb.0: // %entry
; CHECK-COMMON-NEXT: mov w0, #42 // =0x2a
; CHECK-COMMON-NEXT: ret
entry:
ret i32 42
}
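; The private-ZA call (cosf) only occurs on the %use_c path. The current
; lowering sets TPIDR2_EL0 and restores ZA within %use_c; the new lowering
; enables the lazy save once in the entry block and restores ZA in %exit.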
define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_inout_za" {
; CHECK-LABEL: multi_bb_stpidr2_save_required:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: msub x8, x8, x8, x9
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: stur x8, [x29, #-16]
; CHECK-NEXT: sturh wzr, [x29, #-6]
; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: cbz w0, .LBB1_2
; CHECK-NEXT: // %bb.1: // %use_b
; CHECK-NEXT: fmov s1, #4.00000000
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: b .LBB1_5
; CHECK-NEXT: .LBB1_2: // %use_c
; CHECK-NEXT: fmov s0, s1
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sturh w8, [x29, #-8]
; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl cosf
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
; CHECK-NEXT: cbnz x8, .LBB1_4
; CHECK-NEXT: // %bb.3: // %use_c
; CHECK-NEXT: bl __arm_tpidr2_restore
; CHECK-NEXT: .LBB1_4: // %use_c
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: .LBB1_5: // %exit
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: multi_bb_stpidr2_save_required:
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: mov x29, sp
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -8
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
; CHECK-NEWLOWERING-NEXT: mov x9, sp
; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
; CHECK-NEWLOWERING-NEXT: mov sp, x9
; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB1_2
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b
; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000
; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1
; CHECK-NEWLOWERING-NEXT: b .LBB1_3
; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %use_c
; CHECK-NEWLOWERING-NEXT: fmov s0, s1
; CHECK-NEWLOWERING-NEXT: bl cosf
; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %exit
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_5
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
; CHECK-NEWLOWERING-NEXT: .LBB1_5: // %exit
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
%cmp = icmp ne i32 %a, 0
br i1 %cmp, label %use_b, label %use_c
use_b:
%faddr = fadd float %b, 4.0
br label %exit
use_c:
%res2 = call float @llvm.cos.f32(float %c)
br label %exit
exit:
%ret = phi float [%faddr, %use_b], [%res2, %use_c]
ret float %ret
}
; FIXME: This is missing stack probes with -aarch64-new-sme-abi.
define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: str xzr, [sp, #-16]!
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: msub x8, x8, x8, x9
; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT: cmp sp, x8
; CHECK-NEXT: b.le .LBB2_3
; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB2_1
; CHECK-NEXT: .LBB2_3:
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: stur x8, [x29, #-16]
; CHECK-NEXT: sturh wzr, [x29, #-6]
; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: cbz w0, .LBB2_5
; CHECK-NEXT: // %bb.4: // %use_b
; CHECK-NEXT: fmov s1, #4.00000000
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: b .LBB2_8
; CHECK-NEXT: .LBB2_5: // %use_c
; CHECK-NEXT: fmov s0, s1
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sturh w8, [x29, #-8]
; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl cosf
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
; CHECK-NEXT: cbnz x8, .LBB2_7
; CHECK-NEXT: // %bb.6: // %use_c
; CHECK-NEXT: bl __arm_tpidr2_restore
; CHECK-NEXT: .LBB2_7: // %use_c
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: .LBB2_8: // %exit
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: multi_bb_stpidr2_save_required_stackprobe:
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: mov x29, sp
; CHECK-NEWLOWERING-NEXT: str xzr, [sp, #-16]!
; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -8
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
; CHECK-NEWLOWERING-NEXT: mov x9, sp
; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
; CHECK-NEWLOWERING-NEXT: mov sp, x9
; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_2
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b
; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000
; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1
; CHECK-NEWLOWERING-NEXT: b .LBB2_3
; CHECK-NEWLOWERING-NEXT: .LBB2_2: // %use_c
; CHECK-NEWLOWERING-NEXT: fmov s0, s1
; CHECK-NEWLOWERING-NEXT: bl cosf
; CHECK-NEWLOWERING-NEXT: .LBB2_3: // %exit
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_5
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %exit
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
%cmp = icmp ne i32 %a, 0
br i1 %cmp, label %use_b, label %use_c
use_b:
%faddr = fadd float %b, 4.0
br label %exit
use_c:
%res2 = call float @llvm.cos.f32(float %c)
br label %exit
exit:
%ret = phi float [%faddr, %use_b], [%res2, %use_c]
ret float %ret
}
declare float @llvm.cos.f32(float)