[AMDGPU] Global and Buffer loads to LDS should not increase lgkmcnt (#179305)

`global_load_lds` and `buffer_load` to LDS only increment `vmcnt` and do
not touch `lgkmcnt`. Modeling them as also incrementing `lgkmcnt` causes
invalid `waitcnts` for some Triton kernels, similar to the added lit
tests.

Note that the change for buffer ops is not strictly necessary, i.e. the
lit test passes even before this PR, because it seems like
`SIInsertWaitcnts` does not use `LGKM_CNT` for buffer ops. However, this
change might prevent a bug in the future.
This commit is contained in:
Alexander Weinrauch 2026-02-05 17:36:00 +00:00 committed by GitHub
parent 77034cd325
commit 3b16468814
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 42 additions and 4 deletions

View File

@ -509,7 +509,7 @@ class MUBUF_Load_Pseudo <string opName,
let AsmMatchConverter = "cvtMubuf";
let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
let LGKM_CNT = isLds;
let LGKM_CNT = 0;
let has_vdata = !not(!or(isLds, isLdsOpc));
let mayLoad = 1;
let mayStore = isLds;

View File

@ -398,7 +398,7 @@ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsy
!if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)),
(ins flat_offset:$offset, CPol_0:$cpol)),
!if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
let LGKM_CNT = !not(IsAsync);
let LGKM_CNT = 0;
let VM_CNT = !not(IsAsync);
let ASYNC_CNT = IsAsync;
let is_flat_global = 1;

View File

@ -54,7 +54,6 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX90A-NEXT: s_mov_b32 m0, s4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_load_dword v0, s[2:3] offset:32 slc lds
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX90A-NEXT: s_endpgm
main_body:

View File

@ -67,6 +67,45 @@ body: |
...
# Test that GLOBAL_LOAD_LDS does not increment lgkmcnt (LGKM_CNT = 0).
# DS_READ increments lgkmcnt. When using the DS_READ result, we wait for lgkmcnt(0).
# GCN-LABEL: name: ds_read_global_load_lds_use_ds_data
# GCN: DS_READ_B32_gfx9
# GCN-NEXT: GLOBAL_LOAD_LDS_DWORD
# GCN-NEXT: S_WAITCNT 49279
# lgkmcnt(0)
# GCN-NEXT: V_ADD_U32_e32
---
name: ds_read_global_load_lds_use_ds_data
body: |
bb.0:
$m0 = S_MOV_B32 0
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
GLOBAL_LOAD_LDS_DWORD $vgpr2_vgpr3, 4, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
S_ENDPGM 0
...
# Test that BUFFER_LOAD_DWORD_LDS does not increment lgkmcnt.
# DS_READ increments lgkmcnt. When using the DS_READ result, we wait for lgkmcnt(0).
# GCN-LABEL: name: ds_read_buffer_load_lds_use_ds_data
# GCN: DS_READ_B32_gfx9
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
# GCN-NEXT: S_WAITCNT 49279
# lgkmcnt(0)
# GCN-NEXT: V_ADD_U32_e32
---
name: ds_read_buffer_load_lds_use_ds_data
body: |
bb.0:
$m0 = S_MOV_B32 0
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
$vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
S_ENDPGM 0
...
# GCN-LABEL: name: scratch_load_lds_dword_ds_read
# GCN: SCRATCH_LOAD_LDS_DWORD
# GCN-NEXT: S_WAITCNT 3952

View File

@ -327,7 +327,7 @@ define void @global_load_lds_dword_saddr_and_vaddr(ptr addrspace(1) nocapture in
; GFX942-NEXT: s_mov_b32 m0, s2
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:48 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
main_body:
%voffset.64 = zext i32 %voffset to i64