From 3b16468814364d1d2ca07a2be3f92fea60ae5059 Mon Sep 17 00:00:00 2001 From: Alexander Weinrauch Date: Thu, 5 Feb 2026 17:36:00 +0000 Subject: [PATCH] [AMDGPU] Global and Buffer loads to LDS should not increase `lgkmcnt` (#179305) `global_load_lds` and `buffer_load to lds` only increment `vmcnt` and do not touch `lgkmcnt`. Modeling them as incrementing `lgkmcnt` causes invalid `waitcnts` for some Triton kernels, similar to the added lit tests. Note that the change for buffer ops is not necessary, i.e. the lit test passes even before this PR, because it seems like `SIInsertWaitcnts` does not use `LGKM_CNT` for buffer ops. But this change might prevent a bug in the future. --- llvm/lib/Target/AMDGPU/BUFInstructions.td | 2 +- llvm/lib/Target/AMDGPU/FLATInstructions.td | 2 +- .../CodeGen/AMDGPU/fix-crash-valu-hazard.ll | 1 - llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir | 39 +++++++++++++++++++ .../AMDGPU/ptradd-sdag-optimizations.ll | 2 +- 5 files changed, 42 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 48904582adba..fde67e9e2d83 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -509,7 +509,7 @@ class MUBUF_Load_Pseudo { - let LGKM_CNT = !not(IsAsync); + let LGKM_CNT = 0; let VM_CNT = !not(IsAsync); let ASYNC_CNT = IsAsync; let is_flat_global = 1; diff --git a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll index 7e9f21b94bea..a526b1ec1650 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll @@ -54,7 +54,6 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt ; GFX90A-NEXT: s_mov_b32 m0, s4 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_load_dword v0, s[2:3] offset:32 slc lds -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; GFX90A-NEXT: s_endpgm main_body: diff --git 
a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir index 21372c06d322..0e64d0430668 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir @@ -67,6 +67,45 @@ body: | ... +# Test that GLOBAL_LOAD_LDS does not increment lgkmcnt (LGKM_CNT = 0). +# GCN-LABEL: name: ds_read_global_load_lds_use_ds_data +# GCN: DS_READ_B32_gfx9 +# GCN-NEXT: GLOBAL_LOAD_LDS_DWORD +# GCN-NEXT: S_WAITCNT 49279 +# lgkmcnt(0) +# GCN-NEXT: V_ADD_U32_e32 +--- +name: ds_read_global_load_lds_use_ds_data +body: | + bb.0: + $m0 = S_MOV_B32 0 + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + GLOBAL_LOAD_LDS_DWORD $vgpr2_vgpr3, 4, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + S_ENDPGM 0 + +... + +# Test that BUFFER_LOAD_DWORD_LDS does not increment lgkmcnt. +# DS_READ increments lgkmcnt. When using the DS_READ result, we wait for lgkmcnt(0). +# GCN-LABEL: name: ds_read_buffer_load_lds_use_ds_data +# GCN: DS_READ_B32_gfx9 +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_WAITCNT 49279 +# lgkmcnt(0) +# GCN-NEXT: V_ADD_U32_e32 +--- +name: ds_read_buffer_load_lds_use_ds_data +body: | + bb.0: + $m0 = S_MOV_B32 0 + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`) + $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + S_ENDPGM 0 + +... 
+ # GCN-LABEL: name: scratch_load_lds_dword_ds_read # GCN: SCRATCH_LOAD_LDS_DWORD # GCN-NEXT: S_WAITCNT 3952 diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 1c986a02e8bd..fba7720b37bf 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -327,7 +327,7 @@ define void @global_load_lds_dword_saddr_and_vaddr(ptr addrspace(1) nocapture in ; GFX942-NEXT: s_mov_b32 m0, s2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:48 sc1 -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %voffset.64 = zext i32 %voffset to i64