[AMDGPU] Global and Buffer loads to LDS should not increase lgkmcnt (#179305)
`global_load_lds` and `buffer_load to lds` only increment `vmcnt` and do not touch `lgkmcnt`. Modelling them as also incrementing `lgkmcnt` causes invalid `waitcnts` for some Triton kernels, similar to the added lit tests. Note that the change for buffer ops is not strictly necessary, i.e. the lit test passes even before this PR, because it seems that `SIInsertWaitcnts` does not use `LGKM_CNT` for buffer ops. However, this change might prevent a bug in the future.
This commit is contained in:
parent
77034cd325
commit
3b16468814
@ -509,7 +509,7 @@ class MUBUF_Load_Pseudo <string opName,
|
||||
let AsmMatchConverter = "cvtMubuf";
|
||||
|
||||
let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
|
||||
let LGKM_CNT = isLds;
|
||||
let LGKM_CNT = 0;
|
||||
let has_vdata = !not(!or(isLds, isLdsOpc));
|
||||
let mayLoad = 1;
|
||||
let mayStore = isLds;
|
||||
|
||||
@ -398,7 +398,7 @@ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsy
|
||||
!if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)),
|
||||
(ins flat_offset:$offset, CPol_0:$cpol)),
|
||||
!if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
|
||||
let LGKM_CNT = !not(IsAsync);
|
||||
let LGKM_CNT = 0;
|
||||
let VM_CNT = !not(IsAsync);
|
||||
let ASYNC_CNT = IsAsync;
|
||||
let is_flat_global = 1;
|
||||
|
||||
@ -54,7 +54,6 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
|
||||
; GFX90A-NEXT: s_mov_b32 m0, s4
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_load_dword v0, s[2:3] offset:32 slc lds
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
|
||||
@ -67,6 +67,45 @@ body: |
|
||||
|
||||
...
|
||||
|
||||
# Test that GLOBAL_LOAD_LDS does not increment lgkmcnt (LGKM_CNT = 0).
|
||||
# GCN-LABEL: name: ds_read_global_load_lds_use_ds_data
|
||||
# GCN: DS_READ_B32_gfx9
|
||||
# GCN-NEXT: GLOBAL_LOAD_LDS_DWORD
|
||||
# GCN-NEXT: S_WAITCNT 49279
|
||||
# lgkmcnt(0)
|
||||
# GCN-NEXT: V_ADD_U32_e32
|
||||
---
|
||||
name: ds_read_global_load_lds_use_ds_data
|
||||
body: |
|
||||
bb.0:
|
||||
$m0 = S_MOV_B32 0
|
||||
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
|
||||
GLOBAL_LOAD_LDS_DWORD $vgpr2_vgpr3, 4, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
|
||||
$vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
|
||||
...
|
||||
|
||||
# Test that BUFFER_LOAD_DWORD_LDS does not increment lgkmcnt.
|
||||
# DS_READ increments lgkmcnt. When using the DS_READ result, we wait for lgkmcnt(0).
|
||||
# GCN-LABEL: name: ds_read_buffer_load_lds_use_ds_data
|
||||
# GCN: DS_READ_B32_gfx9
|
||||
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
|
||||
# GCN-NEXT: S_WAITCNT 49279
|
||||
# lgkmcnt(0)
|
||||
# GCN-NEXT: V_ADD_U32_e32
|
||||
---
|
||||
name: ds_read_buffer_load_lds_use_ds_data
|
||||
body: |
|
||||
bb.0:
|
||||
$m0 = S_MOV_B32 0
|
||||
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
|
||||
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
|
||||
$vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: scratch_load_lds_dword_ds_read
|
||||
# GCN: SCRATCH_LOAD_LDS_DWORD
|
||||
# GCN-NEXT: S_WAITCNT 3952
|
||||
|
||||
@ -327,7 +327,7 @@ define void @global_load_lds_dword_saddr_and_vaddr(ptr addrspace(1) nocapture in
|
||||
; GFX942-NEXT: s_mov_b32 m0, s2
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:48 sc1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%voffset.64 = zext i32 %voffset to i64
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user