llvm-project/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir
Diana Picus f2e8e2faff
[AMDGPU] Make chain functions receive a stack pointer (#184616)
Currently, chain functions are free to set up a stack pointer if they
need one, and they assume they can start at scratch offset 0. This is
not correct if CWSR and dynamic VGPRs are both enabled, since in that
case we need to reserve an area at offset 0 for the trap handler, but
only when running on a compute queue (which we determine at runtime).
Rather than duplicate in every chain function the code sequence for
determining if/how much scratch space needs to be reserved, this patch
changes the ABI of chain functions so that they receive a stack pointer
from their caller.

Since chain functions can no longer use plain offsets to access their
own stack, we'll also need to allocate a frame pointer more often (and
sometimes also a base pointer). For simplicity, we use the same
registers that `amdgpu_gfx` functions do (s32, s33, s34). This may
change in the future. Chain functions never return to their caller and
thus don't need to preserve the frame or base pointer.

Another consequence is that now we might need to realign the stack in
some cases (since it no longer starts at the infinitely aligned 0).
2026-03-06 11:01:42 +01:00

215 lines
11 KiB
YAML

# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s
# We're keeping the IR around for the callees and the CCs
--- |
declare amdgpu_cs_chain void @callee()
declare amdgpu_gfx void @gfx_callee()
define amdgpu_cs_chain void @preserve_inactive_wwm() {ret void}
define amdgpu_cs_chain void @dont_preserve_wwm_if_no_chain_calls() {ret void}
define amdgpu_cs_chain void @dont_preserve_wwm_if_init_whole_wave() {ret void}
define amdgpu_cs_chain void @dont_preserve_non_wwm() {ret void}
define amdgpu_cs_chain void @dont_preserve_v0_v7() {ret void}
define amdgpu_cs_chain void @dont_preserve_sgpr() {ret void}
...
---
# Check that we preserve the inactive lanes of registers v8+ received in the
# MachineFunctionInfo as wwmReservedRegs.
---
name: preserve_inactive_wwm
tracksRegLiveness: true
frameInfo:
hasTailCall: true
machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
returnsVoid: true
wwmReservedRegs:
- '$vgpr8'
- '$vgpr9'
body: |
bb.0:
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-LABEL: name: preserve_inactive_wwm
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr9, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: ("amdgpu-thread-private" load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
...
---
name: dont_preserve_wwm_if_no_chain_calls
tracksRegLiveness: true
frameInfo:
hasTailCall: false
machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
returnsVoid: true
wwmReservedRegs:
- '$vgpr9'
body: |
bb.0:
liveins: $sgpr35, $vgpr8
; GCN-LABEL: name: dont_preserve_wwm_if_no_chain_calls
; GCN: liveins: $sgpr35, $vgpr8
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
; GCN-NEXT: S_ENDPGM 0
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
$sgpr35 = S_MOV_B32 5
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
S_ENDPGM 0
...
---
name: dont_preserve_wwm_if_init_whole_wave
tracksRegLiveness: true
frameInfo:
hasTailCall: true
machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
returnsVoid: true
wwmReservedRegs:
- '$vgpr8'
- '$vgpr9'
hasInitWholeWave: true
body: |
bb.0:
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-LABEL: name: dont_preserve_wwm_if_init_whole_wave
; GCN: liveins: $sgpr0, $sgpr35
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr1
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
...
---
name: dont_preserve_non_wwm
tracksRegLiveness: true
frameInfo:
hasTailCall: true
machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
isChainFunction: true
returnsVoid: true
body: |
bb.0:
liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr16
; GCN-LABEL: name: dont_preserve_non_wwm
; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr16
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr16 = V_MOV_B32_e32 16, implicit $exec
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 8, implicit $exec
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
renamable $vgpr16 = V_MOV_B32_e32 16, implicit $exec
renamable $vgpr8 = V_MOV_B32_e32 8, implicit $exec
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
...
---
name: dont_preserve_v0_v7
tracksRegLiveness: true
frameInfo:
hasTailCall: true
machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
isChainFunction: true
returnsVoid: true
wwmReservedRegs:
- '$vgpr1'
body: |
bb.0:
liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr7, $vgpr8, $vgpr9
; GCN-LABEL: name: dont_preserve_v0_v7
; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr7, $vgpr8, $vgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec
; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr0
; GCN-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr7
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
$sgpr35 = S_MOV_B32 5
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec
renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec
renamable $vgpr8 = COPY killed renamable $vgpr0
renamable $vgpr9 = COPY killed renamable $vgpr7
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
...
---
name: dont_preserve_sgpr
tracksRegLiveness: true
frameInfo:
hasTailCall: true
machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
returnsVoid: true
body: |
bb.0 (%ir-block.0):
liveins: $sgpr0
; GCN-LABEL: name: dont_preserve_sgpr
; GCN: liveins: $sgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc
; GCN-NEXT: $sgpr0 = COPY killed renamable $sgpr1
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0
renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc
$sgpr0 = COPY killed renamable $sgpr1
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0
...