On GFX12+, GLOBAL_INV increments the loadcnt counter but does not write results to any VGPRs. Previously, we unconditionally inserted s_wait_loadcnt 0 at function returns even when the only pending loadcnt events were from GLOBAL_INV instructions. This patch optimizes waitcnt insertion by skipping the loadcnt wait at function boundaries when no VGPRs have pending loads. This is determined by checking whether any VGPR has a score greater than the lower bound for LOAD_CNT; if none does, the pending loadcnt events must come from non-VGPR-writing instructions like GLOBAL_INV. The optimization is limited to GFX12+ targets, where GLOBAL_INV exists and the extended wait count instructions are used. This is a follow-up optimization to PR #135340, which added tracking for GLOBAL_INV in the waitcnt pass.
116 lines
4.6 KiB
YAML
116 lines
4.6 KiB
YAML
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s

# Test that we can optimize away s_wait_loadcnt at function boundaries when
# the only pending LOAD_CNT events are from GLOBAL_INV (which doesn't write
# to VGPRs).
#
# When a function contains only GLOBAL_INV with no actual VMEM loads pending
# to VGPRs, we should not need to emit s_wait_loadcnt 0 before the return.

---
# Test 1: Only GLOBAL_INV, no VGPR loads - should NOT need S_WAIT_LOADCNT
# before return because GLOBAL_INV doesn't write to VGPRs.
#
# The input body is just GLOBAL_INV followed by the return. The checks expect
# the leading S_WAIT_* 0 sequence, then GLOBAL_INV and the return back to
# back. The NOT directive before the return documents the intent; the NEXT
# directive on the return already rejects any instruction in between.
name: func_global_inv_only
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: false
body: |
  bb.0:
    liveins: $sgpr30_sgpr31

    ; GFX12-LABEL: name: func_global_inv_only
    ; GFX12: liveins: $sgpr30_sgpr31
    ; GFX12-NEXT: {{ $}}
    ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
    ; GFX12-NEXT: S_WAIT_EXPCNT 0
    ; GFX12-NEXT: S_WAIT_SAMPLECNT 0
    ; GFX12-NEXT: S_WAIT_BVHCNT 0
    ; GFX12-NEXT: S_WAIT_KMCNT 0
    ; GFX12-NEXT: GLOBAL_INV 16, implicit $exec
    ; GFX12-NOT: S_WAIT_LOADCNT
    ; GFX12-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
    GLOBAL_INV 16, implicit $exec
    S_SETPC_B64_return $sgpr30_sgpr31
...
---
# Test 2: GLOBAL_INV with actual VGPR load - MUST wait for loadcnt
#
# The load into $vgpr0 has not been waited on when the return (which
# implicitly uses $vgpr0) is reached, so the checks expect S_WAIT_LOADCNT 0
# between GLOBAL_INV and the return.
name: func_global_inv_with_vgpr_load
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: false
body: |
  bb.0:
    liveins: $vgpr0, $sgpr0_sgpr1, $sgpr30_sgpr31

    ; GFX12-LABEL: name: func_global_inv_with_vgpr_load
    ; GFX12: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr30_sgpr31
    ; GFX12-NEXT: {{ $}}
    ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
    ; GFX12-NEXT: S_WAIT_EXPCNT 0
    ; GFX12-NEXT: S_WAIT_SAMPLECNT 0
    ; GFX12-NEXT: S_WAIT_BVHCNT 0
    ; GFX12-NEXT: S_WAIT_KMCNT 0
    ; GFX12-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
    ; GFX12-NEXT: GLOBAL_INV 16, implicit $exec
    ; GFX12-NEXT: S_WAIT_LOADCNT 0
    ; GFX12-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
    renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
    GLOBAL_INV 16, implicit $exec
    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
...
---
# Test 3: Only VGPR load (no GLOBAL_INV) - MUST wait for loadcnt
#
# Baseline case without GLOBAL_INV: the load into $vgpr0 is still pending at
# the return, so the checks expect S_WAIT_LOADCNT 0 directly before it.
name: func_vgpr_load_no_global_inv
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: false
body: |
  bb.0:
    liveins: $vgpr0, $sgpr0_sgpr1, $sgpr30_sgpr31

    ; GFX12-LABEL: name: func_vgpr_load_no_global_inv
    ; GFX12: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr30_sgpr31
    ; GFX12-NEXT: {{ $}}
    ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
    ; GFX12-NEXT: S_WAIT_EXPCNT 0
    ; GFX12-NEXT: S_WAIT_SAMPLECNT 0
    ; GFX12-NEXT: S_WAIT_BVHCNT 0
    ; GFX12-NEXT: S_WAIT_KMCNT 0
    ; GFX12-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
    ; GFX12-NEXT: S_WAIT_LOADCNT 0
    ; GFX12-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
    renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
...
---
# Test 4: GLOBAL_INV with load already waited on - should NOT need S_WAIT_LOADCNT at return
# The load was waited on when $vgpr0 was used, so only GLOBAL_INV is pending at return.
#
# The checks expect a single S_WAIT_LOADCNT 0 before the V_MOV that reads
# $vgpr0, and then GLOBAL_INV and the return back to back. The NOT directive
# before the return documents the intent; the NEXT directive on the return
# already rejects any instruction in between.
name: func_global_inv_load_already_waited
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: false
body: |
  bb.0:
    liveins: $vgpr0, $sgpr0_sgpr1, $sgpr30_sgpr31

    ; GFX12-LABEL: name: func_global_inv_load_already_waited
    ; GFX12: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr30_sgpr31
    ; GFX12-NEXT: {{ $}}
    ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
    ; GFX12-NEXT: S_WAIT_EXPCNT 0
    ; GFX12-NEXT: S_WAIT_SAMPLECNT 0
    ; GFX12-NEXT: S_WAIT_BVHCNT 0
    ; GFX12-NEXT: S_WAIT_KMCNT 0
    ; GFX12-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
    ; GFX12-NEXT: S_WAIT_LOADCNT 0
    ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
    ; GFX12-NEXT: GLOBAL_INV 16, implicit $exec
    ; GFX12-NOT: S_WAIT_LOADCNT
    ; GFX12-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
    renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
    GLOBAL_INV 16, implicit $exec
    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
...