llvm-project/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
Sameer Sahasrabuddhe 8f187c74b3
[AMDGPU] introduce S_WAITCNT_LDS_DIRECT in the memory legalizer (#150887)
The new instruction represents the unknown number of waitcnts needed at a
release operation to ensure that prior direct loads to LDS (formerly called LDS
DMA) are completed. The instruction is replaced in SIInsertWaitcnts with a
suitable value for vmcnt().

Co-authored-by: Austin Kerbow <austin.kerbow@amd.com>.
2025-07-30 11:23:28 +05:30

134 lines
6.1 KiB
YAML

# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s
# Expected vmcnt(0) since the direct load is the only load.
---
# Test function: one direct load to LDS followed by the S_WAITCNT_lds_direct
# placeholder, with no other load in between. The expected output replaces
# the placeholder with S_WAITCNT 3952, the vmcnt(0) wait this case expects.
name: dma_then_fence
body: |
bb.0:
; GCN-LABEL: name: dma_then_fence
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
; Pass input: write 0 to $m0, issue the direct load to LDS, then the
; placeholder that the pass is expected to resolve.
$m0 = S_MOV_B32 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0
...
# Expected vmcnt(1): the direct load to LDS must complete, but the later global load may stay outstanding (SIInsertWaitcnts inserts no wait for it).
---
# Test function: a direct load to LDS, then a global load, then the
# S_WAITCNT_lds_direct placeholder. The expected output replaces the
# placeholder with S_WAITCNT 3953, the vmcnt(1) wait this case expects.
name: dma_then_global_load
body: |
bb.0:
; GCN-LABEL: name: dma_then_global_load
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3953
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
; Pass input: the global load is issued after the direct load to LDS and
; before the placeholder; its result ($vgpr2) is not read in this function.
$m0 = S_MOV_B32 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0
...
# Expected no vmcnt wait: there is no direct load to LDS, and SIInsertWaitcnts inserts no wait for the global load.
---
# Test function: only a global load precedes the S_WAITCNT_lds_direct
# placeholder; there is no direct load to LDS. The expected output drops the
# placeholder without a replacement, so the only S_WAITCNT left is the
# S_WAITCNT 0 at the top of the block.
name: no_dma_just_fence
body: |
bb.0:
; GCN-LABEL: name: no_dma_just_fence
; GCN: S_WAITCNT 0
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
; Pass input: global load, then the placeholder; $vgpr2 is not read afterwards.
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0
...
# Expected vmcnt(1): the direct load to LDS must complete, but the later global load may stay outstanding (SIInsertWaitcnts inserts no wait for it).
---
# Test function: a direct load to LDS, then a global load, then the
# S_WAITCNT_lds_direct placeholder. The expected output replaces the
# placeholder with S_WAITCNT 3953, the vmcnt(1) wait this case expects.
# Unlike the other direct-load cases in this file, the input does not write
# $m0 before the buffer load that implicitly reads it.
# NOTE(review): apart from the missing $m0 write, the instruction sequence is
# the same as in dma_then_global_load; nothing visible here distinguishes a
# "system" fence - confirm what this case is meant to cover.
name: dma_then_system_fence
body: |
bb.0:
; GCN-LABEL: name: dma_then_system_fence
; GCN: S_WAITCNT 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3953
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
; Pass input: direct load to LDS, global load, then the placeholder.
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0
...
# The computed vmcnt(1) gets merged with the existing vmcnt(0).
---
# Test function: the input already contains an explicit S_WAITCNT 3952
# immediately before the S_WAITCNT_lds_direct placeholder. The expected
# output has a single S_WAITCNT 3952 at that position, i.e. no second wait
# instruction is emitted for the placeholder.
name: merge_with_prev_wait
body: |
bb.0:
; GCN-LABEL: name: merge_with_prev_wait
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
; Pass input: both loads, then the pre-existing wait, then the placeholder.
$m0 = S_MOV_B32 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT 3952
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0
...
# The computed vmcnt(1) gets merged with the existing vmcnt(0).
---
# Test function: the input contains an explicit S_WAITCNT 3952 immediately
# after the S_WAITCNT_lds_direct placeholder (the reverse order of
# merge_with_prev_wait). The expected output again has a single
# S_WAITCNT 3952 at that position.
name: merge_with_next_wait
body: |
bb.0:
; GCN-LABEL: name: merge_with_next_wait
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
; Pass input: both loads, then the placeholder, then the pre-existing wait.
$m0 = S_MOV_B32 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_lds_direct
S_WAITCNT 3952
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0
...