
## Short Summary

This patch adds a new pass, `aarch64-machine-sme-abi`, to handle the ABI for ZA state (e.g., lazy saves and agnostic ZA functions). It is not currently enabled by default (but aims to be by LLVM 22). The goal is for this new pass to place ZA saves/restores more optimally and to work with exception handling.

## Long Description

This patch reimplements the management of ZA state for functions with private and shared ZA state. Agnostic ZA functions will be handled in a later patch. For now, this is gated behind the flag `-aarch64-new-sme-abi`; however, we intend for it to replace the current SelectionDAG implementation once complete.

The approach taken here is to mark instructions as needing ZA to be in a specific state ("ACTIVE" or "LOCAL_SAVED"). Machine instructions that implicitly define or use ZA registers (such as $zt0 or $zab0) require the "ACTIVE" state. Function calls may need the "LOCAL_SAVED" or "ACTIVE" state depending on whether the callee has shared or private ZA. We already add ZA register uses/definitions to machine instructions, so no extra work is needed to mark these. Calls are marked by gluing AArch64ISD::INOUT_ZA_USE or AArch64ISD::REQUIRES_ZA_SAVE to the CALLSEQ_START.

These markers are then used by the MachineSMEABIPass to find instructions where there is a transition between required ZA states. These are the points where we need to insert code to set up or restore a ZA save (or initialize ZA).

To handle control flow between blocks (which may have different ZA state requirements), we bundle the incoming and outgoing edges of blocks. Bundles are formed by assigning each block an incoming and an outgoing bundle (initially, every block has its own two bundles), then joining the outgoing bundle of each block with the incoming bundle of all of its successors. Each bundle is then assigned a ZA state based on the blocks that participate in it: blocks whose incoming edges are in a bundle "vote" for the ZA state required at the first instruction in the block, and blocks whose outgoing edges are in a bundle vote for the ZA state required at the last instruction in the block. The ZA state with the most votes is used, which aims to minimize the number of state transitions.
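To make the bundling and voting scheme concrete, below is a minimal, self-contained sketch in plain C++ (not the actual `MachineSMEABIPass` code; the `Block` and `Bundles` types, the two-state enum, and the tiny example CFG are illustrative stand-ins for the real machine-function data structures). It forms edge bundles with a small union-find and then picks each bundle's ZA state by majority vote:

```cpp
// Minimal sketch (not the actual MachineSMEABIPass code) of the edge-bundling
// and voting scheme described above. Block, Bundles, and the example CFG are
// illustrative stand-ins for the real MachineFunction data structures.
#include <algorithm>
#include <array>
#include <cstdio>
#include <numeric>
#include <vector>

enum ZAState { ACTIVE = 0, LOCAL_SAVED = 1, NUM_STATES = 2 };

struct Block {
  ZAState FirstInstState; // ZA state required by the block's first instruction.
  ZAState LastInstState;  // ZA state required by the block's last instruction.
  std::vector<int> Succs; // Successor block indices.
};

// Union-find over bundle IDs. Each block initially owns two bundles:
// 2*B for its incoming edges and 2*B+1 for its outgoing edges.
struct Bundles {
  std::vector<int> Parent;
  explicit Bundles(size_t NumBlocks) : Parent(2 * NumBlocks) {
    std::iota(Parent.begin(), Parent.end(), 0);
  }
  int find(int X) { return Parent[X] == X ? X : Parent[X] = find(Parent[X]); }
  void join(int A, int B) { Parent[find(A)] = find(B); }
  int incoming(int B) { return find(2 * B); }
  int outgoing(int B) { return find(2 * B + 1); }
};

int main() {
  // Tiny CFG: block 0 branches to blocks 1 and 2, which need different states.
  std::vector<Block> Blocks = {{ACTIVE, LOCAL_SAVED, {1, 2}},
                               {ACTIVE, ACTIVE, {}},
                               {LOCAL_SAVED, LOCAL_SAVED, {}}};

  // Form bundles: join each block's outgoing bundle with the incoming bundle
  // of every successor, so all edges meeting at a CFG point share one state.
  Bundles B(Blocks.size());
  for (int I = 0; I < (int)Blocks.size(); ++I)
    for (int Succ : Blocks[I].Succs)
      B.join(2 * I + 1, 2 * Succ);

  // Vote: a block's incoming bundle gets a vote for the state needed by its
  // first instruction; its outgoing bundle for the state needed by its last.
  std::vector<std::array<int, NUM_STATES>> Votes(2 * Blocks.size(), {0, 0});
  for (int I = 0; I < (int)Blocks.size(); ++I) {
    Votes[B.incoming(I)][Blocks[I].FirstInstState]++;
    Votes[B.outgoing(I)][Blocks[I].LastInstState]++;
  }

  // The state with the most votes wins; blocks whose requirement disagrees
  // with their bundle's state get a transition inserted at the block boundary.
  for (int I = 0; I < (int)Blocks.size(); ++I) {
    auto &V = Votes[B.incoming(I)];
    int State = std::max_element(V.begin(), V.end()) - V.begin();
    std::printf("block %d incoming edge state: %s\n", I,
                State == ACTIVE ? "ACTIVE" : "LOCAL_SAVED");
  }
}
```

In this toy CFG, block 1 requires "ACTIVE" on entry but its incoming bundle is voted "LOCAL_SAVED" (two votes to one), so a transition would be inserted at the start of block 1 rather than on every edge in the bundle.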
```llvm
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING

declare void @shared_za_callee() "aarch64_inout_za"

define void @private_za() "aarch64_new_za" {
; CHECK-LABEL: private_za:
; CHECK: // %bb.0: // %prelude
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: cbz x8, .LBB0_2
; CHECK-NEXT: b .LBB0_1
; CHECK-NEXT: .LBB0_1: // %save.za
; CHECK-NEXT: bl __arm_tpidr2_save
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: smstart za
; CHECK-NEXT: zero {za}
; CHECK-NEXT: bl shared_za_callee
; CHECK-NEXT: smstop za
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: private_za:
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -16
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_1
; CHECK-NEWLOWERING-NEXT: b .LBB0_2
; CHECK-NEWLOWERING-NEXT: .LBB0_1:
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: zero {za}
; CHECK-NEWLOWERING-NEXT: b .LBB0_2
; CHECK-NEWLOWERING-NEXT: .LBB0_2:
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: bl shared_za_callee
; CHECK-NEWLOWERING-NEXT: smstop za
; CHECK-NEWLOWERING-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
  call void @shared_za_callee()
  ret void
}

; Note: This test must run at -O0 as otherwise the multiple exits are optimized out.
; TODO: We should be able to omit the ZA save here (as this function does not use ZA).
define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_new_za" {
; CHECK-LABEL: private_za_multiple_exit:
; CHECK: // %bb.0: // %prelude
; CHECK-NEXT: sub sp, sp, #32
; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: str x2, [sp] // 8-byte Folded Spill
; CHECK-NEXT: str w1, [sp, #8] // 4-byte Folded Spill
; CHECK-NEXT: str w0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: cbz x8, .LBB1_2
; CHECK-NEXT: b .LBB1_1
; CHECK-NEXT: .LBB1_1: // %save.za
; CHECK-NEXT: bl __arm_tpidr2_save
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: b .LBB1_2
; CHECK-NEXT: .LBB1_2: // %entry
; CHECK-NEXT: ldr x8, [sp] // 8-byte Folded Reload
; CHECK-NEXT: smstart za
; CHECK-NEXT: zero {za}
; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: b.ne .LBB1_4
; CHECK-NEXT: b .LBB1_3
; CHECK-NEXT: .LBB1_3: // %if.else
; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: smstop za
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_4: // %if.end
; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload
; CHECK-NEXT: subs w0, w8, w9
; CHECK-NEXT: smstop za
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: private_za_multiple_exit:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_1
; CHECK-NEWLOWERING-NEXT: b .LBB1_2
; CHECK-NEWLOWERING-NEXT: .LBB1_1: // %entry
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_save
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: zero {za}
; CHECK-NEWLOWERING-NEXT: b .LBB1_2
; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %entry
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: str w1, [sp, #8] // 4-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: str w0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: subs x8, x2, #1
; CHECK-NEWLOWERING-NEXT: b.ne .LBB1_4
; CHECK-NEWLOWERING-NEXT: b .LBB1_3
; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %if.else
; CHECK-NEWLOWERING-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: add w0, w8, w9
; CHECK-NEWLOWERING-NEXT: smstop za
; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
; CHECK-NEWLOWERING-NEXT: ret
; CHECK-NEWLOWERING-NEXT: .LBB1_4: // %if.end
; CHECK-NEWLOWERING-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: subs w0, w8, w9
; CHECK-NEWLOWERING-NEXT: smstop za
; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
; CHECK-NEWLOWERING-NEXT: ret
entry:
  %tobool = icmp eq i64 %cond, 1
  br i1 %tobool, label %if.else, label %if.end

if.else:
  %add = add i32 %a, %b
  ret i32 %add

if.end:
  %sub = sub i32 %a, %b
  ret i32 %sub
}

; In simple cases like this we should omit all ZA setup.
define i32 @private_za_trivially_does_not_use_za(i32 %x) "aarch64_new_za" {
; CHECK-LABEL: private_za_trivially_does_not_use_za:
; CHECK: // %bb.0: // %prelude
; CHECK-NEXT: sub sp, sp, #32
; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: str w0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: cbz x8, .LBB2_2
; CHECK-NEXT: b .LBB2_1
; CHECK-NEXT: .LBB2_1: // %save.za
; CHECK-NEXT: bl __arm_tpidr2_save
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: b .LBB2_2
; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT: smstart za
; CHECK-NEXT: zero {za}
; CHECK-NEXT: add w0, w8, w8
; CHECK-NEXT: smstop za
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: private_za_trivially_does_not_use_za:
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: add w0, w0, w0
; CHECK-NEWLOWERING-NEXT: ret
  %ret = add i32 %x, %x
  ret i32 %ret
}
```