[AMDGPU][gfx12] Clean-up implementation of waits before SCOPE_SYS stores (#150587)
We can do it all in finalizeStore if we ensure it always sees the stores. For that, I needed to fix a hidden bug where finalizeStore wouldn't see all stores because sometimes the iterator got out-of-sync and didn't point to the store anymore. This also removes the waits before volatile LDS stores which never needed it, that was a bug until now.
This commit is contained in:
parent
c8a091e1b6
commit
a6532c2ada
@ -321,8 +321,7 @@ public:
|
||||
bool IsNonTemporal,
|
||||
bool IsLastUse = false) const = 0;
|
||||
|
||||
virtual bool finalizeStore(MachineBasicBlock::iterator &MI,
|
||||
bool Atomic) const {
|
||||
virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
|
||||
return false;
|
||||
};
|
||||
|
||||
@ -603,8 +602,7 @@ public:
|
||||
bool IsVolatile, bool IsNonTemporal,
|
||||
bool IsLastUse) const override;
|
||||
|
||||
bool finalizeStore(MachineBasicBlock::iterator &MI,
|
||||
bool Atomic) const override;
|
||||
bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
|
||||
|
||||
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
|
||||
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
|
||||
@ -2538,9 +2536,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
|
||||
if (IsVolatile) {
|
||||
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
|
||||
|
||||
if (Op == SIMemOp::STORE)
|
||||
Changed |= insertWaitsBeforeSystemScopeStore(MI);
|
||||
|
||||
// Ensure operation has completed at system scope to cause all volatile
|
||||
// operations to be visible outside the program in a global order. Do not
|
||||
// request cross address space as only the global address space can be
|
||||
@ -2553,9 +2548,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
|
||||
bool Atomic) const {
|
||||
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
|
||||
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
|
||||
MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
|
||||
if (!CPol)
|
||||
return false;
|
||||
|
||||
@ -2570,7 +2564,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
|
||||
|
||||
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
|
||||
// space.
|
||||
if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU)
|
||||
if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU)
|
||||
return setScope(MI, CPol::SCOPE_SE);
|
||||
|
||||
return false;
|
||||
@ -2674,6 +2668,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
|
||||
assert(!MI->mayLoad() && MI->mayStore());
|
||||
|
||||
bool Changed = false;
|
||||
// FIXME: Necessary hack because iterator can lose track of the store.
|
||||
MachineInstr &StoreMI = *MI;
|
||||
|
||||
if (MOI.isAtomic()) {
|
||||
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
|
||||
@ -2690,7 +2686,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
|
||||
MOI.getIsCrossAddressSpaceOrdering(),
|
||||
Position::BEFORE);
|
||||
|
||||
Changed |= CC->finalizeStore(MI, /*Atomic=*/true);
|
||||
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
@ -2703,7 +2699,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
|
||||
|
||||
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
|
||||
// instruction field, do not confuse it with atomic scope.
|
||||
Changed |= CC->finalizeStore(MI, /*Atomic=*/false);
|
||||
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
|
@ -141,7 +141,6 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
|
||||
; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
|
||||
; GFX1250-NEXT: s_wait_storecnt 0x0
|
||||
; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS
|
||||
; GFX1250-NEXT: s_wait_storecnt 0x0
|
||||
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
||||
|
@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0(
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
|
||||
; GFX12-WGP-NEXT: s_endpgm
|
||||
;
|
||||
@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0(
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-CU-NEXT: ds_store_b32 v0, v1
|
||||
; GFX12-CU-NEXT: s_endpgm
|
||||
ptr addrspace(1) %in, ptr addrspace(3) %out) {
|
||||
@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1(
|
||||
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
|
||||
; GFX12-WGP-NEXT: s_endpgm
|
||||
;
|
||||
@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1(
|
||||
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-CU-NEXT: ds_store_b32 v0, v1
|
||||
; GFX12-CU-NEXT: s_endpgm
|
||||
ptr addrspace(1) %in, ptr addrspace(3) %out) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user