[AMDGPU] Insert waitcnt for non-global fence release in GFX12 (#159282)

A fence release could be followed by a barrier, so it should wait for
the relevant memory accesses to complete, even if it is mmra-limited to
LDS. So far, that would be skipped for non-global fence releases.

Fixes SWDEV-554932.
This commit is contained in:
Fabian Ritter 2025-09-23 11:52:38 +02:00 committed by GitHub
parent b6c061e6a9
commit 3f8c7e9fa3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 187 additions and 38 deletions

View File

@ -2514,6 +2514,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
@ -2521,53 +2523,51 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
// writeback as all memory operations by the same thread are
// sequentially consistent, and no other thread can access scratch
// memory.
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
if (Pos == Position::AFTER)
++MI;
// Other address spaces do not have a cache.
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
return false;
if (Pos == Position::AFTER)
++MI;
// global_wb is only necessary at system scope for GFX12.0,
// they're also necessary at device scope for GFX12.5.
//
// Emitting it for lower scopes is a slow no-op, so we omit it
// for performance.
switch (Scope) {
case SIAtomicScope::SYSTEM:
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
.addImm(AMDGPU::CPol::SCOPE_SYS);
break;
case SIAtomicScope::AGENT:
// TODO DOCS
if (ST.hasGFX1250Insts()) {
// global_wb is only necessary at system scope for GFX12.0,
// they're also necessary at device scope for GFX12.5.
//
// Emitting it for lower scopes is a slow no-op, so we omit it
// for performance.
switch (Scope) {
case SIAtomicScope::SYSTEM:
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
.addImm(AMDGPU::CPol::SCOPE_DEV);
.addImm(AMDGPU::CPol::SCOPE_SYS);
Changed = true;
break;
case SIAtomicScope::AGENT:
// TODO DOCS
if (ST.hasGFX1250Insts()) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
.addImm(AMDGPU::CPol::SCOPE_DEV);
Changed = true;
}
break;
case SIAtomicScope::CLUSTER:
case SIAtomicScope::WORKGROUP:
// No WB necessary, but we still have to wait.
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// No WB or wait necessary here, but insertWait takes care of that.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
break;
case SIAtomicScope::CLUSTER:
case SIAtomicScope::WORKGROUP:
// No WB necessary, but we still have to wait.
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// No WB or wait necessary here.
return false;
default:
llvm_unreachable("Unsupported synchronization scope");
}
if (Pos == Position::AFTER)
--MI;
if (Pos == Position::AFTER)
--MI;
}
// We always have to wait for previous memory operations (load/store) to
// complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
// we of course need to wait for that as well.
insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
return true;
return Changed;
}
bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(

View File

@ -0,0 +1,122 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define float @test_barrier_workgroup_local_mmra(ptr addrspace(3) noundef %x, ptr addrspace(3) noundef %y, float %val) {
; GFX10-WGP-LABEL: test_barrier_workgroup_local_mmra:
; GFX10-WGP: ; %bb.0:
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: ds_write_b32 v0, v2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_barrier
; GFX10-WGP-NEXT: ds_read_b32 v0, v1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-WGP-LABEL: test_barrier_workgroup_local_mmra:
; GFX11-WGP: ; %bb.0:
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: ds_store_b32 v0, v2
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_barrier
; GFX11-WGP-NEXT: ds_load_b32 v0, v1
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-WGP-LABEL: test_barrier_workgroup_local_mmra:
; GFX12-WGP: ; %bb.0:
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: s_wait_expcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v2
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: s_barrier_signal -1
; GFX12-WGP-NEXT: s_barrier_wait -1
; GFX12-WGP-NEXT: ds_load_b32 v0, v1
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: test_barrier_workgroup_local_mmra:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: ds_store_b32 v0, v2
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_barrier_signal -1
; GFX1250-NEXT: s_barrier_wait -1
; GFX1250-NEXT: ds_load_b32 v0, v1
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
store float %val, ptr addrspace(3) %x
fence syncscope("workgroup") release, !mmra !0
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("workgroup") acquire, !mmra !0
%ret = load float, ptr addrspace(3) %y
ret float %ret
}
define float @test_barrier_workgroup_global_mmra(ptr addrspace(1) noundef %x, ptr addrspace(1) noundef %y, float %val) {
; GFX10-WGP-LABEL: test_barrier_workgroup_global_mmra:
; GFX10-WGP: ; %bb.0:
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v[0:1], v4, off
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: s_barrier
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_load_dword v0, v[2:3], off
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-WGP-LABEL: test_barrier_workgroup_global_mmra:
; GFX11-WGP: ; %bb.0:
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v[0:1], v4, off
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_barrier
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_load_b32 v0, v[2:3], off
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-WGP-LABEL: test_barrier_workgroup_global_mmra:
; GFX12-WGP: ; %bb.0:
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: s_wait_expcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v[0:1], v4, off
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_barrier_signal -1
; GFX12-WGP-NEXT: s_barrier_wait -1
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: global_load_b32 v0, v[2:3], off
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: test_barrier_workgroup_global_mmra:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_b32 v[0:1], v4, off
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_barrier_signal -1
; GFX1250-NEXT: s_barrier_wait -1
; GFX1250-NEXT: global_load_b32 v0, v[2:3], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
store float %val, ptr addrspace(1) %x
fence syncscope("workgroup") release, !mmra !1
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("workgroup") acquire, !mmra !1
%ret = load float, ptr addrspace(1) %y
ret float %ret
}
!0 = !{!"amdgpu-synchronize-as", !"local"}
!1 = !{!"amdgpu-synchronize-as", !"global"}

View File

@ -143,14 +143,17 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX12-WGP-LABEL: workgroup_release_fence:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: workgroup_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_release_fence:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@ -213,14 +216,17 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: workgroup_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_acq_rel_fence:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@ -283,14 +289,17 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: workgroup_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_seq_cst_fence:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
@ -670,14 +679,17 @@ define amdgpu_kernel void @agent_release_fence() {
;
; GFX12-WGP-LABEL: agent_release_fence:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: agent_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: agent_release_fence:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@ -740,14 +752,17 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
;
; GFX12-WGP-LABEL: agent_acq_rel_fence:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: agent_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: agent_acq_rel_fence:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@ -810,14 +825,17 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
;
; GFX12-WGP-LABEL: agent_seq_cst_fence:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: agent_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: agent_seq_cst_fence:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
@ -1197,14 +1215,17 @@ define amdgpu_kernel void @system_release_fence() {
;
; GFX12-WGP-LABEL: system_release_fence:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: system_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: system_release_fence:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
entry:
fence release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@ -1267,14 +1288,17 @@ define amdgpu_kernel void @system_acq_rel_fence() {
;
; GFX12-WGP-LABEL: system_acq_rel_fence:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: system_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: system_acq_rel_fence:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
entry:
fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@ -1337,14 +1361,17 @@ define amdgpu_kernel void @system_seq_cst_fence() {
;
; GFX12-WGP-LABEL: system_seq_cst_fence:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: system_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: system_seq_cst_fence:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
entry:
fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}