The existing "LDS DMA" builtins/intrinsics copy data from a global/buffer pointer to LDS. These are now augmented with ".async" versions, for which the compiler does not automatically track completion. Instead, completion is tracked using explicit mark/wait intrinsics, which must be inserted by the user. This makes it possible to write programs with efficient waits in software-pipelined loops: the program can wait for only the oldest outstanding operations to finish, while launching more operations for later use. This change only introduces the new names of the builtins/intrinsics, which continue to behave exactly like their non-async counterparts. A later change will implement the actual mark/wait semantics in SIInsertWaitcnts. This is part of a stack split out from #173259: #180467, #180466. Fixes: SWDEV-521121
159 lines
6.5 KiB
YAML
159 lines
6.5 KiB
YAML
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s
|
|
|
|
# Case: an LDS-DMA buffer load (its memory operands are a load from
# addrspace(1) and a store into addrspace(3)) is immediately followed by a
# DS read from addrspace(3).
# Expected output: exactly one S_WAITCNT is placed between the two
# instructions. Its immediate, 3952, is the encoded form of the wait that the
# annotation below spells out as vmcnt(0).
# GCN-LABEL: name: buffer_load_dword_lds_ds_read
|
|
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
|
|
# GCN-NEXT: S_WAITCNT 3952
|
|
# vmcnt(0)
|
|
# GCN-NEXT: DS_READ_B32_gfx9
|
|
---
|
|
name: buffer_load_dword_lds_ds_read
|
|
body: |
|
|
bb.0:
|
|
$m0 = S_MOV_B32 0
|
|
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
|
|
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
|
|
S_ENDPGM 0
|
|
|
|
...
|
|
|
|
# Case: an LDS-DMA buffer load, then an ordinary BUFFER_LOAD_DWORD_IDXEN into
# $vgpr10 (only a load memory operand, no store into addrspace(3)), then a DS
# read from addrspace(3).
# Expected output: the wait before the DS read is 3953, annotated below as
# vmcnt(1), rather than vmcnt(0).
# NOTE(review): presumably this lets the later plain buffer load stay
# outstanding while only the older LDS-DMA load is waited for -- confirm
# against SIInsertWaitcnts.
# GCN-LABEL: name: buffer_load_dword_lds_vmcnt_1
|
|
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
|
|
# GCN-NEXT: BUFFER_LOAD_DWORD_IDXEN
|
|
# GCN-NEXT: S_WAITCNT 3953
|
|
# vmcnt(1)
|
|
# GCN-NEXT: DS_READ_B32_gfx9
|
|
---
|
|
name: buffer_load_dword_lds_vmcnt_1
|
|
body: |
|
|
bb.0:
|
|
$m0 = S_MOV_B32 0
|
|
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
|
|
$vgpr10 = BUFFER_LOAD_DWORD_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`)
|
|
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
|
|
S_ENDPGM 0
|
|
|
|
...
|
|
|
|
# Case: an LDS-DMA buffer load followed by a FLAT_LOAD_DWORD whose memory
# operand is a generic `ptr` (no address space given).
# Expected output: a single S_WAITCNT 3952 (annotated below as vmcnt(0)) is
# placed before the flat load.
# NOTE(review): presumably the wait is needed because a flat access may
# address LDS -- confirm against SIInsertWaitcnts.
# GCN-LABEL: name: buffer_load_dword_lds_flat_read
|
|
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
|
|
# GCN-NEXT: S_WAITCNT 3952
|
|
# vmcnt(0)
|
|
# GCN-NEXT: FLAT_LOAD_DWORD
|
|
---
|
|
name: buffer_load_dword_lds_flat_read
|
|
body: |
|
|
bb.0:
|
|
$m0 = S_MOV_B32 0
|
|
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
|
|
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`)
|
|
|
|
S_ENDPGM 0
|
|
|
|
...
|
|
|
|
# Case: the GLOBAL_LOAD_LDS_DWORD form of LDS DMA (memory operands: load from
# addrspace(1), store into addrspace(3)) immediately followed by a DS read
# from addrspace(3).
# Expected output: one S_WAITCNT 3952 (annotated below as vmcnt(0)) between
# the two instructions, the same wait expected for the buffer variant in
# buffer_load_dword_lds_ds_read.
# GCN-LABEL: name: global_load_lds_dword_ds_read
|
|
# GCN: GLOBAL_LOAD_LDS_DWORD
|
|
# GCN-NEXT: S_WAITCNT 3952
|
|
# vmcnt(0)
|
|
# GCN-NEXT: DS_READ_B32_gfx9
|
|
---
|
|
name: global_load_lds_dword_ds_read
|
|
body: |
|
|
bb.0:
|
|
$m0 = S_MOV_B32 0
|
|
GLOBAL_LOAD_LDS_DWORD $vgpr0_vgpr1, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
|
|
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
|
|
S_ENDPGM 0
|
|
|
|
...
|
|
|
|
# Test that GLOBAL_LOAD_LDS does not increment lgkmcnt (LGKM_CNT = 0).
# Sequence: a DS read defines $vgpr0, a GLOBAL_LOAD_LDS_DWORD is issued next,
# and V_ADD_U32 then consumes $vgpr0.
# Expected output: no wait between the DS read and the LDS-DMA load, and a
# single S_WAITCNT 49279 (annotated below as lgkmcnt(0)) before the add that
# uses the DS read result.
|
|
# GCN-LABEL: name: ds_read_global_load_lds_use_ds_data
|
|
# GCN: DS_READ_B32_gfx9
|
|
# GCN-NEXT: GLOBAL_LOAD_LDS_DWORD
|
|
# GCN-NEXT: S_WAITCNT 49279
|
|
# lgkmcnt(0)
|
|
# GCN-NEXT: V_ADD_U32_e32
|
|
---
|
|
name: ds_read_global_load_lds_use_ds_data
|
|
body: |
|
|
bb.0:
|
|
$m0 = S_MOV_B32 0
|
|
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
|
|
GLOBAL_LOAD_LDS_DWORD $vgpr2_vgpr3, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
|
|
$vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
|
S_ENDPGM 0
|
|
|
|
...
|
|
|
|
# Test that BUFFER_LOAD_DWORD_LDS does not increment lgkmcnt.
|
|
# DS_READ increments lgkmcnt. When using the DS_READ result, we wait for lgkmcnt(0).
# Sequence: a DS read defines $vgpr0, a BUFFER_LOAD_DWORD_LDS_IDXEN is issued
# next, and V_ADD_U32 then consumes $vgpr0.
# Expected output: no wait between the DS read and the LDS-DMA load, and a
# single S_WAITCNT 49279 (annotated below as lgkmcnt(0)) before the add.
|
|
# GCN-LABEL: name: ds_read_buffer_load_lds_use_ds_data
|
|
# GCN: DS_READ_B32_gfx9
|
|
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
|
|
# GCN-NEXT: S_WAITCNT 49279
|
|
# lgkmcnt(0)
|
|
# GCN-NEXT: V_ADD_U32_e32
|
|
---
|
|
name: ds_read_buffer_load_lds_use_ds_data
|
|
body: |
|
|
bb.0:
|
|
$m0 = S_MOV_B32 0
|
|
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
|
|
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
|
|
$vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
|
S_ENDPGM 0
|
|
|
|
...
|
|
|
|
# Case: the SCRATCH_LOAD_LDS_DWORD form of LDS DMA (memory operands: load from
# addrspace(5), store into addrspace(3)) immediately followed by a DS read
# from addrspace(3).
# Expected output: one S_WAITCNT 3952 (annotated below as vmcnt(0)) between
# the two instructions.
# GCN-LABEL: name: scratch_load_lds_dword_ds_read
|
|
# GCN: SCRATCH_LOAD_LDS_DWORD
|
|
# GCN-NEXT: S_WAITCNT 3952
|
|
# vmcnt(0)
|
|
# GCN-NEXT: DS_READ_B32_gfx9
|
|
---
|
|
name: scratch_load_lds_dword_ds_read
|
|
body: |
|
|
bb.0:
|
|
$m0 = S_MOV_B32 0
|
|
SCRATCH_LOAD_LDS_DWORD $vgpr0, 4, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(5) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
|
|
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
|
|
S_ENDPGM 0
|
|
|
|
...
|
|
|
|
# Case: BUFFER_STORE_LDS_DWORD, whose memory operands go the opposite way from
# the load tests in this file (load from addrspace(3), store into
# addrspace(1)), immediately followed by a DS read from addrspace(3).
# Expected output: the DS read directly follows the store, i.e. no S_WAITCNT
# is inserted between them.
# NOTE(review): presumably no wait is required because the store only reads
# LDS and the following instruction also only reads it -- confirm.
# GCN-LABEL: name: buffer_store_lds_dword_ds_read
|
|
# GCN: BUFFER_STORE_LDS_DWORD
|
|
# GCN-NEXT: DS_READ_B32_gfx9
|
|
---
|
|
name: buffer_store_lds_dword_ds_read
|
|
body: |
|
|
bb.0:
|
|
$m0 = S_MOV_B32 0
|
|
BUFFER_STORE_LDS_DWORD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(3) poison` + 4), (store (s32) into `ptr addrspace(1) poison` + 4)
|
|
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
|
|
S_ENDPGM 0
|
|
|
|
...
|
|
|
|
# No need to wait before load from VMEM to LDS.
# Sequence: three back-to-back LDS-DMA buffer loads at offsets 0, 4 and 8
# (memory operands offset by +0, +4 and +8 in both address spaces), followed
# by a DS read whose memory operand is addrspace(3) with no offset.
# Expected output: no wait is inserted between the three loads, and a single
# S_WAITCNT 3952 (annotated below as vmcnt(0)) precedes the DS read.
|
|
# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read
|
|
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
|
|
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
|
|
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
|
|
# GCN-NEXT: S_WAITCNT 3952
|
|
# vmcnt(0)
|
|
# GCN-NEXT: DS_READ_B32_gfx9
|
|
---
|
|
name: series_of_buffer_load_dword_lds_ds_read
|
|
body: |
|
|
bb.0:
|
|
$m0 = S_MOV_B32 0
|
|
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
|
|
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
|
|
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
|
|
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
|
|
S_ENDPGM 0
|
|
|
|
...
|