[AMDGPU] fix asyncmark soft waitcnt bug (#184851)

Asyncmarks record the current wait state and so should not allow waitcnts that occur after them to be merged into waitcnts that occur before.
This commit is contained in:
Ryan Mitchell 2026-03-06 11:38:32 -08:00 committed by GitHub
parent 097122cb90
commit 918d0feb76
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 147 additions and 15 deletions

View File

@ -3247,18 +3247,22 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (Block.getFirstTerminator() == Inst)
FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
// FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
assert(ST->getGeneration() < AMDGPUSubtarget::GFX12);
ScoreBrackets.recordAsyncMark(Inst);
continue;
}
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
FlushFlags);
OldWaitcntInstr = nullptr;
if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
// FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
//
// Asyncmarks record the current wait state and so should not allow
// waitcnts that occur after them to be merged into waitcnts that occur
// before.
assert(ST->getGeneration() < AMDGPUSubtarget::GFX12);
ScoreBrackets.recordAsyncMark(Inst);
continue;
}
if (TII->isSMRD(Inst)) {
for (const MachineMemOperand *Memop : Inst.memoperands()) {
// No need to handle invariant loads when avoiding WAR conflicts, as

View File

@ -246,6 +246,109 @@ entry:
ret void
}
; Tests that a fence that inserts waits can be used with asyncmark.
define void @fence_with_asyncmark(ptr addrspace(8) inreg %buf, ptr addrspace(1) %foo, ptr addrspace(3) inreg %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out) {
; WITHASYNC-LABEL: fence_with_asyncmark:
; WITHASYNC: ; %bb.0: ; %entry
; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; WITHASYNC-NEXT: s_mov_b32 m0, s20
; WITHASYNC-NEXT: global_load_dword v6, v[2:3], off
; WITHASYNC-NEXT: global_load_dword v7, v[0:1], off
; WITHASYNC-NEXT: v_mov_b32_e32 v8, 0x54
; WITHASYNC-NEXT: ; wave barrier
; WITHASYNC-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds
; WITHASYNC-NEXT: s_waitcnt vmcnt(0)
; WITHASYNC-NEXT: ; asyncmark
; WITHASYNC-NEXT: global_load_dword v0, v[0:1], off
; WITHASYNC-NEXT: v_mov_b32_e32 v1, 0x58
; WITHASYNC-NEXT: ; wave barrier
; WITHASYNC-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds
; WITHASYNC-NEXT: ; wave barrier
; WITHASYNC-NEXT: global_load_dword v1, v[2:3], off
; WITHASYNC-NEXT: v_mov_b32_e32 v2, s20
; WITHASYNC-NEXT: ; asyncmark
; WITHASYNC-NEXT: ; wait_asyncmark(1)
; WITHASYNC-NEXT: ds_read_b32 v3, v2
; WITHASYNC-NEXT: ; wait_asyncmark(0)
; WITHASYNC-NEXT: s_waitcnt vmcnt(1)
; WITHASYNC-NEXT: ds_read_b32 v2, v2
; WITHASYNC-NEXT: v_add_u32_e32 v6, v7, v6
; WITHASYNC-NEXT: s_waitcnt lgkmcnt(1)
; WITHASYNC-NEXT: v_add3_u32 v0, v6, v3, v0
; WITHASYNC-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; WITHASYNC-NEXT: v_add3_u32 v0, v0, v1, v2
; WITHASYNC-NEXT: global_store_dword v[4:5], v0, off
; WITHASYNC-NEXT: s_waitcnt vmcnt(0)
; WITHASYNC-NEXT: s_setpc_b64 s[30:31]
;
; WITHOUT-LABEL: fence_with_asyncmark:
; WITHOUT: ; %bb.0: ; %entry
; WITHOUT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; WITHOUT-NEXT: s_mov_b32 m0, s20
; WITHOUT-NEXT: global_load_dword v6, v[2:3], off
; WITHOUT-NEXT: global_load_dword v7, v[0:1], off
; WITHOUT-NEXT: v_mov_b32_e32 v8, 0x54
; WITHOUT-NEXT: ; wave barrier
; WITHOUT-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds
; WITHOUT-NEXT: s_waitcnt vmcnt(0)
; WITHOUT-NEXT: ; asyncmark
; WITHOUT-NEXT: global_load_dword v0, v[0:1], off
; WITHOUT-NEXT: v_mov_b32_e32 v1, 0x58
; WITHOUT-NEXT: ; wave barrier
; WITHOUT-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds
; WITHOUT-NEXT: ; wave barrier
; WITHOUT-NEXT: global_load_dword v1, v[2:3], off
; WITHOUT-NEXT: v_mov_b32_e32 v2, s20
; WITHOUT-NEXT: ; asyncmark
; WITHOUT-NEXT: ; wait_asyncmark(1)
; WITHOUT-NEXT: s_waitcnt vmcnt(1)
; WITHOUT-NEXT: ds_read_b32 v3, v2
; WITHOUT-NEXT: ; wait_asyncmark(0)
; WITHOUT-NEXT: ds_read_b32 v2, v2
; WITHOUT-NEXT: v_add_u32_e32 v6, v7, v6
; WITHOUT-NEXT: s_waitcnt lgkmcnt(1)
; WITHOUT-NEXT: v_add3_u32 v0, v6, v3, v0
; WITHOUT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; WITHOUT-NEXT: v_add3_u32 v0, v0, v1, v2
; WITHOUT-NEXT: global_store_dword v[4:5], v0, off
; WITHOUT-NEXT: s_waitcnt vmcnt(0)
; WITHOUT-NEXT: s_setpc_b64 s[30:31]
entry:
; First batch: global load, global load, async global-to-LDS.
%bar_v11 = load i32, ptr addrspace(1) %bar
%foo_v1 = load i32, ptr addrspace(1) %foo
call void @llvm.amdgcn.wave.barrier()
call void @llvm.amdgcn.raw.ptr.buffer.load.ASYNC.lds(ptr addrspace(8) %buf, ptr addrspace(3) %lds, i32 4, i32 84, i32 0, i32 0, i32 0)
fence release
call void @llvm.amdgcn.asyncmark()
; Second batch: global load, async global-to-LDS, global load.
%foo_v2 = load i32, ptr addrspace(1) %foo
call void @llvm.amdgcn.wave.barrier()
call void @llvm.amdgcn.raw.ptr.buffer.load.ASYNC.lds(ptr addrspace(8) %buf, ptr addrspace(3) %lds, i32 4, i32 88, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.wave.barrier()
%bar_v12 = load i32, ptr addrspace(1) %bar
call void @llvm.amdgcn.asyncmark()
; Wait for first async mark and read from LDS.
; This results in vmcnt(3) corresponding to the second batch.
call void @llvm.amdgcn.wait.asyncmark(i16 1)
%lds_val21 = load i32, ptr addrspace(3) %lds
; Wait for the next lds dma.
; This results in vmcnt(1) because the last global load is not async.
call void @llvm.amdgcn.wait.asyncmark(i16 0)
%lds_val22 = load i32, ptr addrspace(3) %lds
%sum1 = add i32 %foo_v1, %bar_v11
%sum2 = add i32 %sum1, %lds_val21
%sum3 = add i32 %sum2, %foo_v2
%sum4 = add i32 %sum3, %bar_v12
%sum5 = add i32 %sum4, %lds_val22
store i32 %sum5, ptr addrspace(1) %out
ret void
}
; A perfect loop that is unlikely to exist in real life. It uses only async LDS
; DMA operations, and result in vmcnt waits that exactly match the stream of
; those outstanding operations.
@ -263,7 +366,7 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p
; WITHASYNC-NEXT: s_mov_b32 s6, 2
; WITHASYNC-NEXT: s_mov_b64 s[4:5], 0
; WITHASYNC-NEXT: ; asyncmark
; WITHASYNC-NEXT: .LBB3_1: ; %loop_body
; WITHASYNC-NEXT: .LBB4_1: ; %loop_body
; WITHASYNC-NEXT: ; =>This Inner Loop Header: Depth=1
; WITHASYNC-NEXT: v_readfirstlane_b32 s7, v2
; WITHASYNC-NEXT: s_mov_b32 m0, s7
@ -278,7 +381,7 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p
; WITHASYNC-NEXT: s_waitcnt lgkmcnt(0)
; WITHASYNC-NEXT: v_add_u32_e32 v5, v5, v6
; WITHASYNC-NEXT: s_andn2_b64 exec, exec, s[4:5]
; WITHASYNC-NEXT: s_cbranch_execnz .LBB3_1
; WITHASYNC-NEXT: s_cbranch_execnz .LBB4_1
; WITHASYNC-NEXT: ; %bb.2: ; %epilog
; WITHASYNC-NEXT: s_or_b64 exec, exec, s[4:5]
; WITHASYNC-NEXT: ; wait_asyncmark(1)
@ -303,7 +406,7 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p
; WITHOUT-NEXT: s_mov_b32 s6, 2
; WITHOUT-NEXT: s_mov_b64 s[4:5], 0
; WITHOUT-NEXT: ; asyncmark
; WITHOUT-NEXT: .LBB3_1: ; %loop_body
; WITHOUT-NEXT: .LBB4_1: ; %loop_body
; WITHOUT-NEXT: ; =>This Inner Loop Header: Depth=1
; WITHOUT-NEXT: v_readfirstlane_b32 s7, v2
; WITHOUT-NEXT: s_mov_b32 m0, s7
@ -318,7 +421,7 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p
; WITHOUT-NEXT: s_waitcnt lgkmcnt(0)
; WITHOUT-NEXT: v_add_u32_e32 v5, v5, v6
; WITHOUT-NEXT: s_andn2_b64 exec, exec, s[4:5]
; WITHOUT-NEXT: s_cbranch_execnz .LBB3_1
; WITHOUT-NEXT: s_cbranch_execnz .LBB4_1
; WITHOUT-NEXT: ; %bb.2: ; %epilog
; WITHOUT-NEXT: s_or_b64 exec, exec, s[4:5]
; WITHOUT-NEXT: ; wait_asyncmark(1)
@ -395,7 +498,7 @@ define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspac
; WITHASYNC-NEXT: v_mov_b32_e32 v13, v8
; WITHASYNC-NEXT: s_waitcnt vmcnt(1)
; WITHASYNC-NEXT: v_mov_b32_e32 v15, v9
; WITHASYNC-NEXT: .LBB4_1: ; %loop_body
; WITHASYNC-NEXT: .LBB5_1: ; %loop_body
; WITHASYNC-NEXT: ; =>This Inner Loop Header: Depth=1
; WITHASYNC-NEXT: v_readfirstlane_b32 s7, v2
; WITHASYNC-NEXT: s_waitcnt vmcnt(1)
@ -415,7 +518,7 @@ define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspac
; WITHASYNC-NEXT: ; asyncmark
; WITHASYNC-NEXT: ; wait_asyncmark(2)
; WITHASYNC-NEXT: s_andn2_b64 exec, exec, s[4:5]
; WITHASYNC-NEXT: s_cbranch_execnz .LBB4_1
; WITHASYNC-NEXT: s_cbranch_execnz .LBB5_1
; WITHASYNC-NEXT: ; %bb.2: ; %epilog
; WITHASYNC-NEXT: s_or_b64 exec, exec, s[4:5]
; WITHASYNC-NEXT: ds_read_b32 v0, v2
@ -456,7 +559,7 @@ define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspac
; WITHOUT-NEXT: v_mov_b32_e32 v13, v8
; WITHOUT-NEXT: s_waitcnt vmcnt(1)
; WITHOUT-NEXT: v_mov_b32_e32 v15, v9
; WITHOUT-NEXT: .LBB4_1: ; %loop_body
; WITHOUT-NEXT: .LBB5_1: ; %loop_body
; WITHOUT-NEXT: ; =>This Inner Loop Header: Depth=1
; WITHOUT-NEXT: v_readfirstlane_b32 s7, v2
; WITHOUT-NEXT: s_waitcnt vmcnt(1)
@ -476,7 +579,7 @@ define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspac
; WITHOUT-NEXT: ; asyncmark
; WITHOUT-NEXT: ; wait_asyncmark(2)
; WITHOUT-NEXT: s_andn2_b64 exec, exec, s[4:5]
; WITHOUT-NEXT: s_cbranch_execnz .LBB4_1
; WITHOUT-NEXT: s_cbranch_execnz .LBB5_1
; WITHOUT-NEXT: ; %bb.2: ; %epilog
; WITHOUT-NEXT: s_or_b64 exec, exec, s[4:5]
; WITHOUT-NEXT: s_waitcnt vmcnt(0)

View File

@ -0,0 +1,25 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=si-insert-waitcnts -o - %s | FileCheck %s
--- |
define void @asyncmark_preexisting_waitcnt() { unreachable }
...
---
name: asyncmark_preexisting_waitcnt
tracksRegLiveness: true
machineFunctionInfo:
occupancy: 16
body: |
bb.0:
liveins: $vgpr0
; CHECK-LABEL: name: asyncmark_preexisting_waitcnt
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_WAITCNT 0
; CHECK-NEXT: ASYNCMARK
; CHECK-NEXT: S_ENDPGM 0
S_WAITCNT_soft 0
ASYNCMARK
S_ENDPGM 0
...