
# When a loop contains a VMEM load whose result is only used outside the loop,
# do not bother to flush vmcnt in the loop head on GFX12. A wait for vmcnt will
# be required inside the loop anyway, because VMEM instructions can write their
# VGPR results out of order.
#
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s

---

# The loop contains a store and a use of a value loaded outside of the loop.
# We expect the waitcnt for the use to be hoisted into the preheader (bb.0) on
# GFX9, but not on GFX10+ because we have the vscnt counter; there the wait is
# expected to stay in the loop body (bb.1).

# GFX9-LABEL: waitcnt_vm_loop
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name:            waitcnt_vm_loop
body:             |
  ; Preheader: issues the load whose result ($vgpr0) is read inside the loop.
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  ; Single-block loop: one store plus a use of the value loaded in bb.0.
  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
|
|
---

# Same as before, but the loop preheader has no terminator: bb.0 falls through
# into the loop instead of ending with an explicit branch.

# GFX9-LABEL: waitcnt_vm_loop_noterm
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_noterm
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_noterm
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name:            waitcnt_vm_loop_noterm
body:             |
  ; Preheader without a terminator instruction.
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec

  ; Single-block loop: one store plus a use of the value loaded in bb.0.
  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
|
|
---

# Same as before but there is a preexisting waitcnt in the preheader.
# The directives expect exactly one matching wait in bb.0 (no second one is
# added after it) and none in the loop. Only the GFX9 run is checked for this
# function.

# GFX9-LABEL: waitcnt_vm_loop_noterm_wait
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
name:            waitcnt_vm_loop_noterm_wait
body:             |
  ; Preheader without a terminator; the wait is already present in the input.
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_WAITCNT 3952

  ; Single-block loop: one store plus a use of the value loaded in bb.0.
  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
|
|
---

# The loop contains a store, a load, and uses values loaded both inside and
# outside the loop.
# We do not expect the waitcnt to be hoisted out of the loop.

# GFX9-LABEL: waitcnt_vm_loop_load
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_load
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_load
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name:            waitcnt_vm_loop_load
body:             |
  ; Preheader: loads $vgpr0, which is read inside the loop.
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  ; Loop: the V_ADD reads both $vgpr0 (loaded in bb.0) and $vgpr7 (loaded here).
  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
|
|
---

# The loop contains a use of a value loaded outside of the loop, and neither a
# store nor a load.
# We do not expect the waitcnt to be hoisted out of the loop.

# GFX9-LABEL: waitcnt_vm_loop_no_store
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_no_store
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_no_store
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name:            waitcnt_vm_loop_no_store
body:             |
  ; Preheader: loads $vgpr0, which is read inside the loop.
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  ; Loop without any memory instruction; it only reads $vgpr0.
  bb.1:
    successors: %bb.1, %bb.2

    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
|
|
---

# The loop contains a store, no load, and doesn't use any value loaded inside
# or outside of the loop. There is only one use of the loaded value in the
# exit block.
# We don't expect any s_waitcnt vmcnt in the loop body or preheader, but expect
# one in the exit block.
# NOTE(review): the directives below only verify the absence of a wait in bb.0
# and bb.1; nothing after the bb.2 label positively checks for the wait in the
# exit block.


# GFX9-LABEL: waitcnt_vm_loop_no_use
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_no_use
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_no_use
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name:            waitcnt_vm_loop_no_use
body:             |
  ; Preheader: loads $vgpr0, which is not read before the exit block.
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  ; Loop: a store and an add that does not touch $vgpr0.
  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  ; Exit block: the only use of the loaded $vgpr0.
  bb.2:
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    S_ENDPGM 0

...
|
|
---

# The loop loads a value that is not used in the loop, and uses a value loaded
# outside of the loop.
# On GFX9 and GFX10 we expect the waitcnt to be hoisted out of the loop to wait
# a single time before the loop is executed and avoid waiting for the load to
# complete on each iteration.
# On GFX12 the directives below expect no S_WAIT_LOADCNT 0 after the preheader
# load and a wait inside the loop instead.

# GFX9-LABEL: waitcnt_vm_loop2
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop2
# GFX12-LABEL: bb.0:
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name:            waitcnt_vm_loop2
body:             |
  ; Preheader: loads $vgpr0, which is read inside the loop.
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  ; Loop: reads $vgpr0 and loads $vgpr1, which is never read in the loop.
  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
|
|
---

# Same as before with an additional store in the loop. On GFX9 and GFX10 we
# still expect the waitcnt instructions to be hoisted. On GFX12 the directives
# below expect no S_WAIT_LOADCNT 0 after the preheader load and a wait inside
# the loop instead.

# GFX9-LABEL: waitcnt_vm_loop2_store
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2_store
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop2_store
# GFX12-LABEL: bb.0:
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name:            waitcnt_vm_loop2_store
body:             |
  ; Preheader: loads $vgpr0, which is read inside the loop.
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  ; Loop: reads $vgpr0, loads $vgpr1 (never read in the loop) and stores.
  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
|
|
---

# Same as loop2 but the value loaded inside the loop is also used in the loop.
# We do not expect the waitcnt to be hoisted out of the loop.

# GFX9-LABEL: waitcnt_vm_loop2_use_in_loop
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2_use_in_loop
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop2_use_in_loop
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name:            waitcnt_vm_loop2_use_in_loop
body:             |
  ; Preheader: loads $vgpr0, which is read inside the loop.
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  ; Loop: $vgpr1 is loaded here and read by the second V_ADD.
  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr4 = V_ADD_U32_e32 $vgpr5, $vgpr1, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
|
|
---

# The loop contains a use of a value loaded outside of the loop, but we already
# waited for that load to complete. The loop also loads a value that is not used
# in the loop. On GFX9 and GFX10 we do not expect any waitcnt in the loop.
# On GFX12 the directives below do expect a S_WAIT_LOADCNT 0 in the loop block
# (bb.2). NOTE(review): presumably this is for the in-loop load redefining
# $vgpr1 on each iteration, since VMEM instructions can write their VGPR results
# out of order there -- confirm.

# GFX9-LABEL: waitcnt_vm_loop2_nowait
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.3:

# GFX10-LABEL: waitcnt_vm_loop2_nowait
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.3:

# GFX12-LABEL: waitcnt_vm_loop2_nowait
# GFX12-LABEL: bb.0:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.3:
name:            waitcnt_vm_loop2_nowait
body:             |
  ; Entry: the loaded $vgpr0 is read right away, in the same block.
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    S_BRANCH %bb.1

  ; Straight-line block between the entry and the loop; no memory access.
  bb.1:
    successors: %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
    $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
    $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec

    S_BRANCH %bb.2

  ; Loop: reads $vgpr0 again and loads $vgpr1, which is never read.
  bb.2:
    successors: %bb.2, %bb.3

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.2, implicit killed $scc
    S_BRANCH %bb.3

  bb.3:
    S_ENDPGM 0

...
|
|
---

# Similar test case but for register intervals: the preheader load defines the
# four-register tuple $vgpr0_vgpr1_vgpr2_vgpr3 and the loop reads only $vgpr0;
# the in-loop IMAGE_SAMPLE result ($vgpr4_vgpr5_vgpr6_vgpr7) is not read in the
# loop. The GFX9 and GFX10 directives expect the wait in the preheader; the
# GFX12 directives expect it inside the loop.

# GFX9-LABEL: waitcnt_vm_loop2_reginterval
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2_reginterval
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop2_reginterval
# GFX12-LABEL: bb.0:
# GFX12: GLOBAL_LOAD_DWORDX4
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name:            waitcnt_vm_loop2_reginterval
body:             |
  ; Preheader: one load defining four consecutive VGPRs.
  bb.0:
    successors: %bb.1

    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec

    S_BRANCH %bb.1

  ; Loop: reads one register of the tuple loaded in bb.0.
  bb.1:
    successors: %bb.1, %bb.2

    $vgpr10 = COPY $vgpr0

    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
|
|
---

# Similar test case but for register intervals, where one register ($vgpr7) of
# the tuple loaded inside the loop is also read in the loop.
# We do not expect the waitcnt to be hoisted out of the loop.

# GFX9-LABEL: waitcnt_vm_loop2_reginterval2
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop2_reginterval2
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop2_reginterval2
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name:            waitcnt_vm_loop2_reginterval2
body:             |
  ; Preheader: one load defining four consecutive VGPRs.
  bb.0:
    successors: %bb.1

    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec

    S_BRANCH %bb.1

  ; Loop: reads $vgpr0 (loaded in bb.0) and $vgpr7 (loaded in this block).
  bb.1:
    successors: %bb.1, %bb.2

    $vgpr10 = COPY $vgpr0

    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
    $vgpr11 = COPY $vgpr7
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
|
|
---

# The loop loads a value that is not used in the loop, but uses a value loaded
# outside of it. On GFX9 and GFX10 we expect the s_waitcnt instruction to be
# hoisted.
# A s_waitcnt vmcnt(0) is generated to flush in the preheader, but for this
# specific test case, it would be better to use vmcnt(1) instead. This is
# currently not implemented.
# On GFX12 the directives below expect no S_WAIT_LOADCNT 0 after the two
# preheader loads and a wait inside the loop instead.

# GFX9-LABEL: waitcnt_vm_zero
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 3952
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_zero
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16240
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT 16240
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_zero
# GFX12-LABEL: bb.0:
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:

name:            waitcnt_vm_zero
body:             |
  ; Preheader: two loads; only the first result ($vgpr0) is read in the loop.
  bb.0:
    successors: %bb.1

    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  ; Loop: reads $vgpr0 and loads $vgpr2, which is never read in the loop.
  bb.1:
    successors: %bb.1, %bb.2

    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr3, implicit $exec
    $vgpr2 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr3, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
|
|
---

# This test case checks that we flush the vmcnt counter only if necessary
# (i.e. if a waitcnt is needed for the vgpr use we find in the loop).
# Here the only loaded value read in the loop ($vgpr0) is already read, and so
# already waited for, by the second load in bb.0.
# NOTE(review): the negative GFX12 patterns below look for S_WAITCNT, but the
# GFX12 waits checked elsewhere in this file are spelled S_WAIT_LOADCNT /
# S_WAIT_LOADCNT_DSCNT, which S_WAITCNT does not match. These negative checks
# may therefore be vacuous on GFX12 -- confirm against the actual pass output
# before tightening them.

# GFX10-LABEL: waitcnt_vm_necessary
# GFX10-LABEL: bb.0:
# GFX10: S_WAITCNT 16240
# GFX10: $vgpr4
# GFX10-NOT: S_WAITCNT
# GFX10-LABEL: bb.1:
# GFX10-NOT: S_WAITCNT

# GFX12-LABEL: waitcnt_vm_necessary
# GFX12-LABEL: bb.0:
# GFX12: S_WAIT_LOADCNT 0
# GFX12: $vgpr4
# GFX12-NOT: S_WAITCNT
# GFX12-LABEL: bb.1:
# GFX12-NOT: S_WAITCNT

# GFX9-LABEL: waitcnt_vm_necessary
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 3952
# GFX9: $vgpr4
# GFX9-NOT: S_WAITCNT
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT

name:            waitcnt_vm_necessary
body:             |
  ; Preheader: the second load reads $vgpr0, which the first load defines.
  bb.0:
    successors: %bb.1(0x80000000)

    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 killed $vgpr0_vgpr1, 0, 0, implicit $exec
    $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec

  ; Loop: reads $vgpr0 again and loads $vgpr5, which is never read.
  bb.1:
    successors: %bb.1(0x40000000)

    $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_ENDPGM 0

...
|
|
---

# The loop contains a global store, and uses a value that was loaded (with a
# global load) outside of the loop.

# GFX9-LABEL: waitcnt_vm_loop_global_mem
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_global_mem
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_global_mem
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:

name:            waitcnt_vm_loop_global_mem
body:             |
  ; Preheader: loads $vgpr0, which is read inside the loop.
  bb.0:
    successors: %bb.1
    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec
    S_BRANCH %bb.1

  ; Loop: reads $vgpr0 and stores; the exit edge falls through to bb.2.
  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    GLOBAL_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc

  bb.2:
    successors: %bb.3
    S_BRANCH %bb.3

  bb.3:
    S_ENDPGM 0

...
|
|
---

# Same as above case, but use scratch memory instructions instead:
# the preheader loads with SCRATCH_LOAD_DWORD and the loop stores with
# SCRATCH_STORE_DWORD.

# GFX9-LABEL: waitcnt_vm_loop_scratch_mem
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_scratch_mem
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_scratch_mem
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:

name:            waitcnt_vm_loop_scratch_mem
body:             |
  ; Preheader: loads $vgpr0, which is read inside the loop.
  bb.0:
    successors: %bb.1
    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr1, 0, 0, implicit $exec, implicit $flat_scr
    S_BRANCH %bb.1

  ; Loop: reads $vgpr0 and stores; the exit edge falls through to bb.2.
  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    SCRATCH_STORE_DWORD $vgpr4, $vgpr6, 0, 0, implicit $exec, implicit $flat_scr
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc

  bb.2:
    successors: %bb.3
    S_BRANCH %bb.3

  bb.3:
    S_ENDPGM 0

...
|
|
---

# Same as above case, but use flat memory instructions instead:
# the preheader loads with FLAT_LOAD_DWORD and the loop stores with
# FLAT_STORE_DWORD. The expected wait encodings differ from the other cases
# (see the GFX10 and GFX12 patterns below).

# GFX9-LABEL: waitcnt_vm_loop_flat_mem
# GFX9-LABEL: bb.0:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_flat_mem
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 11
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 11
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_flat_mem
# GFX12-LABEL: bb.0:
# GFX12: FLAT_LOAD_DWORD
# GFX12-NOT: S_WAIT_LOADCNT_DSCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT_DSCNT 0
# GFX12-LABEL: bb.2:
name:            waitcnt_vm_loop_flat_mem
body:             |
  ; Preheader: loads $vgpr0, which is read inside the loop.
  bb.0:
    successors: %bb.1
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
    S_BRANCH %bb.1

  ; Loop: reads $vgpr0 and stores; the exit edge falls through to bb.2.
  bb.1:
    successors: %bb.1, %bb.2

    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    FLAT_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec, implicit $flat_scr
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc

  bb.2:
    successors: %bb.3
    S_BRANCH %bb.3

  bb.3:
    S_ENDPGM 0

...
|
|
---

# The loop contains a store, a load, and uses values loaded both inside and
# outside the loop.
# We do not expect the waitcnt to be hoisted out of the loop.
# NOTE(review): despite the "flat_load" name, the body below uses
# GLOBAL_LOAD_DWORD / GLOBAL_STORE_DWORD rather than FLAT_* instructions --
# confirm this is intended.

# GFX9-LABEL: waitcnt_vm_loop_flat_load
# GFX9-LABEL: bb.0:
# GFX9-NOT: S_WAITCNT 39
# GFX9-LABEL: bb.1:
# GFX9: S_WAITCNT 39
# GFX9-LABEL: bb.2:

# GFX10-LABEL: waitcnt_vm_loop_flat_load
# GFX10-LABEL: bb.0:
# GFX10-NOT: S_WAITCNT 16
# GFX10-LABEL: bb.1:
# GFX10: S_WAITCNT 16
# GFX10-LABEL: bb.2:

# GFX12-LABEL: waitcnt_vm_loop_flat_load
# GFX12-LABEL: bb.0:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name:            waitcnt_vm_loop_flat_load
body:             |
  ; Preheader: loads $vgpr0, which is read inside the loop.
  bb.0:
    successors: %bb.1

    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec
    S_BRANCH %bb.1

  ; Loop: the V_ADD reads both $vgpr0 (loaded in bb.0) and $vgpr7 (loaded here).
  bb.1:
    successors: %bb.1, %bb.2

    GLOBAL_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec
    $vgpr7 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec
    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
|