Currently, chain functions are free to set up a stack pointer if they need one, and they assume they can start at scratch offset 0. This is not correct if CWSR and dynamic VGPRs are both enabled, since in that case we need to reserve an area at offset 0 for the trap handler, but only when running on a compute queue (which we determine at runtime). Rather than duplicate in every chain function the code sequence for determining if/how much scratch space needs to be reserved, this patch changes the ABI of chain functions so that they receive a stack pointer from their caller. Since chain functions can no longer use plain offsets to access their own stack, we'll also need to allocate a frame pointer more often (and sometimes also a base pointer). For simplicity, we use the same registers that `amdgpu_gfx` functions do (s32, s33, s34). This may change in the future. Chain functions never return to their caller and thus don't need to preserve the frame or base pointer. Another consequence is that now we might need to realign the stack in some cases (since it no longer starts at the infinitely aligned 0).
215 lines
11 KiB
YAML
215 lines
11 KiB
YAML
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
|
|
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s
|
|
|
|
# We're keeping the IR around for the callees and the CCs
|
|
|
|
--- |
; IR stubs only: each defined function is a bare 'ret void'. The MIR documents
; that follow reuse these function names and reference @callee in their bodies.
; NOTE(review): @gfx_callee is declared but not referenced by any MIR body visible here.
|
|
declare amdgpu_cs_chain void @callee()
|
|
declare amdgpu_gfx void @gfx_callee()
|
|
|
|
define amdgpu_cs_chain void @preserve_inactive_wwm() {ret void}
|
|
define amdgpu_cs_chain void @dont_preserve_wwm_if_no_chain_calls() {ret void}
|
|
define amdgpu_cs_chain void @dont_preserve_wwm_if_init_whole_wave() {ret void}
|
|
define amdgpu_cs_chain void @dont_preserve_non_wwm() {ret void}
|
|
define amdgpu_cs_chain void @dont_preserve_v0_v7() {ret void}
|
|
define amdgpu_cs_chain void @dont_preserve_sgpr() {ret void}
|
|
...
|
|
---
|
|
|
|
# Check that we preserve the inactive lanes of registers v8+ received in the
|
|
# MachineFunctionInfo as wwmReservedRegs.
|
|
|
|
---
# hasTailCall is true and $vgpr8/$vgpr9 are listed in wwmReservedRegs.
# The GCN lines expect both registers to be stored to scratch at $sgpr32 + 0 / + 4
# before the original instructions and reloaded just before SI_CS_CHAIN_TC_W32,
# each time bracketed by S_XOR_SAVEEXEC_B32 -1 and a restore of $exec_lo.
|
|
name: preserve_inactive_wwm
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: true
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
returnsVoid: true
|
|
wwmReservedRegs:
|
|
- '$vgpr8'
|
|
- '$vgpr9'
|
|
body: |
|
|
bb.0:
|
|
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
|
|
|
|
; GCN-LABEL: name: preserve_inactive_wwm
|
|
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
|
|
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
|
|
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr9, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.1, addrspace 5)
|
|
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
|
|
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
|
|
; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: ("amdgpu-thread-private" load (s32) from %stack.0, addrspace 5)
|
|
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.1, addrspace 5)
|
|
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
|
|
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
|
|
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
|
|
...
|
|
|
|
---
# hasTailCall is false and the body ends in S_ENDPGM rather than a chain call.
# The GCN lines expect the instruction stream to pass through with no scratch
# save/restore inserted, even though $vgpr9 is listed in wwmReservedRegs.
|
|
name: dont_preserve_wwm_if_no_chain_calls
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: false
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
returnsVoid: true
|
|
wwmReservedRegs:
|
|
- '$vgpr9'
|
|
body: |
|
|
bb.0:
|
|
liveins: $sgpr35, $vgpr8
|
|
|
|
; GCN-LABEL: name: dont_preserve_wwm_if_no_chain_calls
|
|
; GCN: liveins: $sgpr35, $vgpr8
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
|
|
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
|
|
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
|
|
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
|
|
; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
|
|
; GCN-NEXT: S_ENDPGM 0
|
|
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
|
|
$sgpr35 = S_MOV_B32 5
|
|
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
|
|
renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
|
|
S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
|
|
S_ENDPGM 0
|
|
...
|
|
|
|
---
# hasInitWholeWave is set. The GCN lines expect no scratch save/restore of
# $vgpr8/$vgpr9 to be inserted, despite hasTailCall being true and both
# registers being listed in wwmReservedRegs: the three input instructions must
# come out unchanged, including the implicit $vgpr8 use on the chain call.
|
|
name: dont_preserve_wwm_if_init_whole_wave
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: true
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
returnsVoid: true
|
|
wwmReservedRegs:
|
|
- '$vgpr8'
|
|
- '$vgpr9'
|
|
hasInitWholeWave: true
|
|
body: |
|
|
bb.0:
|
|
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
|
|
|
|
; GCN-LABEL: name: dont_preserve_wwm_if_init_whole_wave
|
|
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
|
|
...
|
|
|
|
---
# isChainFunction and hasTailCall are set, but no wwmReservedRegs are listed.
# The GCN lines expect $vgpr16 and $vgpr8 to be overwritten with no scratch
# save/restore inserted around them.
|
|
name: dont_preserve_non_wwm
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: true
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
isChainFunction: true
|
|
returnsVoid: true
|
|
body: |
|
|
bb.0:
|
|
liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr16
|
|
|
|
; GCN-LABEL: name: dont_preserve_non_wwm
|
|
; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr16
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: renamable $vgpr16 = V_MOV_B32_e32 16, implicit $exec
|
|
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 8, implicit $exec
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
renamable $vgpr16 = V_MOV_B32_e32 16, implicit $exec
|
|
renamable $vgpr8 = V_MOV_B32_e32 8, implicit $exec
|
|
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
|
|
...
|
|
|
|
---
# The body writes $vgpr0 (also used as the lane-spill slot for $sgpr35) and
# $vgpr7, then copies them into $vgpr8/$vgpr9 for the chain call. The GCN lines
# expect no scratch save/restore to be inserted.
# NOTE(review): wwmReservedRegs lists only $vgpr1, which the body never
# touches - confirm this is the register the test intends to reserve.
|
|
name: dont_preserve_v0_v7
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: true
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
isChainFunction: true
|
|
returnsVoid: true
|
|
wwmReservedRegs:
|
|
- '$vgpr1'
|
|
body: |
|
|
bb.0:
|
|
liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr7, $vgpr8, $vgpr9
|
|
|
|
; GCN-LABEL: name: dont_preserve_v0_v7
|
|
; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr7, $vgpr8, $vgpr9
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
|
|
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
|
|
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
|
|
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec
|
|
; GCN-NEXT: renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec
|
|
; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr0
|
|
; GCN-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr7
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
|
|
renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
|
|
$sgpr35 = S_MOV_B32 5
|
|
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
|
|
renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec
|
|
renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec
|
|
renamable $vgpr8 = COPY killed renamable $vgpr0
|
|
renamable $vgpr9 = COPY killed renamable $vgpr7
|
|
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
|
|
|
|
...
|
|
|
|
---
# $sgpr0 is redefined (via $sgpr1) before the chain call. The GCN lines expect
# the instruction stream to pass through with no SGPR save/restore added.
|
|
name: dont_preserve_sgpr
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: true
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
returnsVoid: true
|
|
body: |
|
|
bb.0 (%ir-block.0):
|
|
liveins: $sgpr0
|
|
|
|
; GCN-LABEL: name: dont_preserve_sgpr
|
|
; GCN: liveins: $sgpr0
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc
|
|
; GCN-NEXT: $sgpr0 = COPY killed renamable $sgpr1
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0
|
|
renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc
|
|
$sgpr0 = COPY killed renamable $sgpr1
|
|
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0
|
|
|
|
...
|