llvm-project/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
Sameer Sahasrabuddhe b02b395a1e
[AMDGPU] Asynchronous loads from global/buffer to LDS on pre-GFX12 (#180466)
The existing "LDS DMA" builtins/intrinsics copy data from a global/buffer
pointer to LDS. These are now augmented with ".async" versions,
where the compiler does not automatically track completion. The
completion is now tracked using explicit mark/wait intrinsics, which
must be inserted by the user. This makes it possible to write programs
with efficient waits in software pipeline loops. The program can now
wait for only the oldest outstanding operations to finish, while
launching more operations for later use.

This change only contains the new names of the builtins/intrinsics,
which continue to behave exactly like their non-async counterparts. A
later change will implement the actual mark/wait semantics in
SIInsertWaitcnts.

This is part of a stack split out from #173259:
- #180467
- #180466

Fixes: SWDEV-521121
2026-02-11 05:26:58 +00:00

159 lines
6.5 KiB
YAML

# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s
# A buffer load that writes to LDS (its memory operands include a store into
# addrspace(3)) is immediately followed by a DS read. The checks require a
# single S_WAITCNT for vmcnt(0) between the two instructions.
# GCN-LABEL: name: buffer_load_dword_lds_ds_read
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: S_WAITCNT 3952
# vmcnt(0)
# GCN-NEXT: DS_READ_B32_gfx9
---
name: buffer_load_dword_lds_ds_read
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    S_ENDPGM 0
...
# A buffer load to LDS is followed by an ordinary buffer load into $vgpr10 and
# then a DS read. The expected wait is vmcnt(1) rather than vmcnt(0), i.e. one
# VMEM operation (presumably the younger, non-LDS load) may stay outstanding
# when the DS read issues.
# GCN-LABEL: name: buffer_load_dword_lds_vmcnt_1
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: BUFFER_LOAD_DWORD_IDXEN
# GCN-NEXT: S_WAITCNT 3953
# vmcnt(1)
# GCN-NEXT: DS_READ_B32_gfx9
---
name: buffer_load_dword_lds_vmcnt_1
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
    $vgpr10 = BUFFER_LOAD_DWORD_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`)
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    S_ENDPGM 0
...
# A buffer load to LDS is followed by a FLAT load from a generic pointer
# (`ptr poison`, no address space). The checks require vmcnt(0) before the
# FLAT load -- presumably because a flat access may resolve to LDS.
# GCN-LABEL: name: buffer_load_dword_lds_flat_read
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: S_WAITCNT 3952
# vmcnt(0)
# GCN-NEXT: FLAT_LOAD_DWORD
---
name: buffer_load_dword_lds_flat_read
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
    $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`)
    S_ENDPGM 0
...
# Same scenario as the buffer variant, using GLOBAL_LOAD_LDS_DWORD: a global
# load that stores into addrspace(3) is followed by a DS read, and the checks
# require vmcnt(0) in between.
# GCN-LABEL: name: global_load_lds_dword_ds_read
# GCN: GLOBAL_LOAD_LDS_DWORD
# GCN-NEXT: S_WAITCNT 3952
# vmcnt(0)
# GCN-NEXT: DS_READ_B32_gfx9
---
name: global_load_lds_dword_ds_read
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    GLOBAL_LOAD_LDS_DWORD $vgpr0_vgpr1, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    S_ENDPGM 0
...
# Test that GLOBAL_LOAD_LDS does not increment lgkmcnt (LGKM_CNT = 0).
# The DS read defines $vgpr0, the global-to-LDS load is issued after it, and
# V_ADD_U32 then consumes $vgpr0. The only wait expected before the add is
# lgkmcnt(0).
# GCN-LABEL: name: ds_read_global_load_lds_use_ds_data
# GCN: DS_READ_B32_gfx9
# GCN-NEXT: GLOBAL_LOAD_LDS_DWORD
# GCN-NEXT: S_WAITCNT 49279
# lgkmcnt(0)
# GCN-NEXT: V_ADD_U32_e32
---
name: ds_read_global_load_lds_use_ds_data
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    GLOBAL_LOAD_LDS_DWORD $vgpr2_vgpr3, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
    $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    S_ENDPGM 0
...
# Test that BUFFER_LOAD_DWORD_LDS does not increment lgkmcnt.
# DS_READ increments lgkmcnt. When using the DS_READ result, we wait for lgkmcnt(0).
# Here the DS read defines $vgpr0, the buffer-to-LDS load follows it, and
# V_ADD_U32 consumes $vgpr0.
# GCN-LABEL: name: ds_read_buffer_load_lds_use_ds_data
# GCN: DS_READ_B32_gfx9
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: S_WAITCNT 49279
# lgkmcnt(0)
# GCN-NEXT: V_ADD_U32_e32
---
name: ds_read_buffer_load_lds_use_ds_data
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
    $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    S_ENDPGM 0
...
# Scratch variant: SCRATCH_LOAD_LDS_DWORD loads from addrspace(5) and stores
# into addrspace(3), then a DS read follows. The checks require vmcnt(0)
# before the DS read.
# GCN-LABEL: name: scratch_load_lds_dword_ds_read
# GCN: SCRATCH_LOAD_LDS_DWORD
# GCN-NEXT: S_WAITCNT 3952
# vmcnt(0)
# GCN-NEXT: DS_READ_B32_gfx9
---
name: scratch_load_lds_dword_ds_read
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    SCRATCH_LOAD_LDS_DWORD $vgpr0, 4, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(5) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    S_ENDPGM 0
...
# Opposite direction: BUFFER_STORE_LDS_DWORD reads from addrspace(3) and
# stores into addrspace(1). The checks expect the DS read to follow it
# directly, with no S_WAITCNT inserted in between.
# GCN-LABEL: name: buffer_store_lds_dword_ds_read
# GCN: BUFFER_STORE_LDS_DWORD
# GCN-NEXT: DS_READ_B32_gfx9
---
name: buffer_store_lds_dword_ds_read
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    BUFFER_STORE_LDS_DWORD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(3) poison` + 4), (store (s32) into `ptr addrspace(1) poison` + 4)
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    S_ENDPGM 0
...
# No need to wait before load from VMEM to LDS.
# Three back-to-back buffer loads to LDS (offsets 0, 4 and 8) must stay
# adjacent, with no wait between them; a single vmcnt(0) is expected only
# before the DS read that follows.
# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: S_WAITCNT 3952
# vmcnt(0)
# GCN-NEXT: DS_READ_B32_gfx9
---
name: series_of_buffer_load_dword_lds_ds_read
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    S_ENDPGM 0
...