llvm-project/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
Alexander Weinrauch 3b16468814
[AMDGPU] Global and Buffer loads to LDS should not increase lgkmcnt (#179305)
`global_load_lds` and `buffer_load to lds` only increment `vmcnt` and do
not touch `lgkmcnt`. Counting them against `lgkmcnt` causes invalid
`waitcnts` for some Triton kernels, similar to the added lit tests.

Note that the change for buffer ops is not necessary, i.e. the lit test
passes even before this PR, because it seems like `SIInsertWaitcnts`
does not use `LGKM_CNT` for buffer ops. But this change might prevent a
bug in the future.
2026-02-05 09:36:00 -08:00

159 lines
6.5 KiB
YAML

# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s
# A buffer load that writes to LDS (memory operands: load from addrspace(1),
# store into addrspace(3)) is immediately followed by a DS read from
# addrspace(3). Exactly one wait is expected between the two instructions.
# GCN-LABEL: name: buffer_load_dword_lds_ds_read
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: S_WAITCNT 3952
# 3952 is the S_WAITCNT immediate for vmcnt(0).
# GCN-NEXT: DS_READ_B32_gfx9
---
name: buffer_load_dword_lds_ds_read
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    S_ENDPGM 0
...
# A buffer load to LDS is followed by an ordinary buffer load into $vgpr10 and
# then by a DS read from addrspace(3). The expected wait is vmcnt(1) rather
# than vmcnt(0): only the older LDS-writing load has to complete before the DS
# read, so the younger ordinary load may still be in flight.
# GCN-LABEL: name: buffer_load_dword_lds_vmcnt_1
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: BUFFER_LOAD_DWORD_IDXEN
# GCN-NEXT: S_WAITCNT 3953
# 3953 is the S_WAITCNT immediate for vmcnt(1).
# GCN-NEXT: DS_READ_B32_gfx9
---
name: buffer_load_dword_lds_vmcnt_1
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
    $vgpr10 = BUFFER_LOAD_DWORD_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`)
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    S_ENDPGM 0
...
# A buffer load to LDS is followed by a FLAT load whose memory operand carries
# no address space (`ptr poison`). A vmcnt(0) wait is expected before the FLAT
# load.
# NOTE(review): presumably because a flat access may resolve to LDS and so may
# alias the DMA store -- confirm against SIInsertWaitcnts.
# GCN-LABEL: name: buffer_load_dword_lds_flat_read
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: S_WAITCNT 3952
# 3952 is the S_WAITCNT immediate for vmcnt(0).
# GCN-NEXT: FLAT_LOAD_DWORD
---
name: buffer_load_dword_lds_flat_read
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
    $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`)
    S_ENDPGM 0
...
# Global-load variant of the basic case: GLOBAL_LOAD_LDS_DWORD (load from
# addrspace(1), store into addrspace(3)) followed by a DS read from
# addrspace(3). A vmcnt(0) wait is expected before the DS read.
# GCN-LABEL: name: global_load_lds_dword_ds_read
# GCN: GLOBAL_LOAD_LDS_DWORD
# GCN-NEXT: S_WAITCNT 3952
# 3952 is the S_WAITCNT immediate for vmcnt(0).
# GCN-NEXT: DS_READ_B32_gfx9
---
name: global_load_lds_dword_ds_read
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    GLOBAL_LOAD_LDS_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    S_ENDPGM 0
...
# Test that GLOBAL_LOAD_LDS does not increment lgkmcnt (LGKM_CNT = 0).
# DS_READ increments lgkmcnt. The GLOBAL_LOAD_LDS issued after it must not be
# counted against lgkmcnt, so when V_ADD uses the DS_READ result ($vgpr0) the
# expected wait is still lgkmcnt(0).
# GCN-LABEL: name: ds_read_global_load_lds_use_ds_data
# GCN: DS_READ_B32_gfx9
# GCN-NEXT: GLOBAL_LOAD_LDS_DWORD
# GCN-NEXT: S_WAITCNT 49279
# 49279 is the S_WAITCNT immediate for lgkmcnt(0).
# GCN-NEXT: V_ADD_U32_e32
---
name: ds_read_global_load_lds_use_ds_data
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    GLOBAL_LOAD_LDS_DWORD $vgpr2_vgpr3, 4, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
    $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    S_ENDPGM 0
...
# Test that BUFFER_LOAD_DWORD_LDS does not increment lgkmcnt.
# DS_READ increments lgkmcnt. The buffer load to LDS issued after it must not
# be counted against lgkmcnt, so when V_ADD uses the DS_READ result ($vgpr0)
# the expected wait is still lgkmcnt(0).
# GCN-LABEL: name: ds_read_buffer_load_lds_use_ds_data
# GCN: DS_READ_B32_gfx9
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: S_WAITCNT 49279
# 49279 is the S_WAITCNT immediate for lgkmcnt(0).
# GCN-NEXT: V_ADD_U32_e32
---
name: ds_read_buffer_load_lds_use_ds_data
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
    $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
    S_ENDPGM 0
...
# Scratch-load variant of the basic case: SCRATCH_LOAD_LDS_DWORD (load from
# addrspace(5), store into addrspace(3)) followed by a DS read from
# addrspace(3). A vmcnt(0) wait is expected before the DS read.
# GCN-LABEL: name: scratch_load_lds_dword_ds_read
# GCN: SCRATCH_LOAD_LDS_DWORD
# GCN-NEXT: S_WAITCNT 3952
# 3952 is the S_WAITCNT immediate for vmcnt(0).
# GCN-NEXT: DS_READ_B32_gfx9
---
name: scratch_load_lds_dword_ds_read
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    SCRATCH_LOAD_LDS_DWORD $vgpr0, 4, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(5) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    S_ENDPGM 0
...
# BUFFER_STORE_LDS_DWORD has the opposite direction to the load tests: its
# memory operands are a load from addrspace(3) and a store into addrspace(1).
# The DS read is expected to follow it directly, with no S_WAITCNT in between.
# NOTE(review): presumably because the store only reads addrspace(3), so the
# DS read cannot observe stale data -- confirm against SIInsertWaitcnts.
# GCN-LABEL: name: buffer_store_lds_dword_ds_read
# GCN: BUFFER_STORE_LDS_DWORD
# GCN-NEXT: DS_READ_B32_gfx9
---
name: buffer_store_lds_dword_ds_read
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    BUFFER_STORE_LDS_DWORD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(3) poison` + 4), (store (s32) into `ptr addrspace(1) poison` + 4)
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    S_ENDPGM 0
...
# No need to wait before load from VMEM to LDS.
# Three back-to-back buffer loads to LDS (offsets 0, 4 and 8) are expected to
# stay adjacent with no wait between them; a single vmcnt(0) wait is expected
# only before the DS read that follows the series.
# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: S_WAITCNT 3952
# 3952 is the S_WAITCNT immediate for vmcnt(0).
# GCN-NEXT: DS_READ_B32_gfx9
---
name: series_of_buffer_load_dword_lds_ds_read
body: |
  bb.0:
    $m0 = S_MOV_B32 0
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
    $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
    S_ENDPGM 0
...