[AMDGPU] fix asyncmark soft waitcnt bug (#184851)
Asyncmarks record the current wait state and so should not allow waitcnts that occur after them to be merged into waitcnts that occur before.
This commit is contained in:
parent
097122cb90
commit
918d0feb76
@ -3247,18 +3247,22 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
|
||||
if (Block.getFirstTerminator() == Inst)
|
||||
FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
|
||||
|
||||
if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
|
||||
// FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
|
||||
assert(ST->getGeneration() < AMDGPUSubtarget::GFX12);
|
||||
ScoreBrackets.recordAsyncMark(Inst);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
|
||||
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
|
||||
FlushFlags);
|
||||
OldWaitcntInstr = nullptr;
|
||||
|
||||
if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
|
||||
// FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
|
||||
//
|
||||
// Asyncmarks record the current wait state and so should not allow
|
||||
// waitcnts that occur after them to be merged into waitcnts that occur
|
||||
// before.
|
||||
assert(ST->getGeneration() < AMDGPUSubtarget::GFX12);
|
||||
ScoreBrackets.recordAsyncMark(Inst);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (TII->isSMRD(Inst)) {
|
||||
for (const MachineMemOperand *Memop : Inst.memoperands()) {
|
||||
// No need to handle invariant loads when avoiding WAR conflicts, as
|
||||
|
||||
@ -246,6 +246,109 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; Tests that a fence that inserts waits can be used with asyncmark.
|
||||
define void @fence_with_asyncmark(ptr addrspace(8) inreg %buf, ptr addrspace(1) %foo, ptr addrspace(3) inreg %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out) {
|
||||
; WITHASYNC-LABEL: fence_with_asyncmark:
|
||||
; WITHASYNC: ; %bb.0: ; %entry
|
||||
; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; WITHASYNC-NEXT: s_mov_b32 m0, s20
|
||||
; WITHASYNC-NEXT: global_load_dword v6, v[2:3], off
|
||||
; WITHASYNC-NEXT: global_load_dword v7, v[0:1], off
|
||||
; WITHASYNC-NEXT: v_mov_b32_e32 v8, 0x54
|
||||
; WITHASYNC-NEXT: ; wave barrier
|
||||
; WITHASYNC-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds
|
||||
; WITHASYNC-NEXT: s_waitcnt vmcnt(0)
|
||||
; WITHASYNC-NEXT: ; asyncmark
|
||||
; WITHASYNC-NEXT: global_load_dword v0, v[0:1], off
|
||||
; WITHASYNC-NEXT: v_mov_b32_e32 v1, 0x58
|
||||
; WITHASYNC-NEXT: ; wave barrier
|
||||
; WITHASYNC-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds
|
||||
; WITHASYNC-NEXT: ; wave barrier
|
||||
; WITHASYNC-NEXT: global_load_dword v1, v[2:3], off
|
||||
; WITHASYNC-NEXT: v_mov_b32_e32 v2, s20
|
||||
; WITHASYNC-NEXT: ; asyncmark
|
||||
; WITHASYNC-NEXT: ; wait_asyncmark(1)
|
||||
; WITHASYNC-NEXT: ds_read_b32 v3, v2
|
||||
; WITHASYNC-NEXT: ; wait_asyncmark(0)
|
||||
; WITHASYNC-NEXT: s_waitcnt vmcnt(1)
|
||||
; WITHASYNC-NEXT: ds_read_b32 v2, v2
|
||||
; WITHASYNC-NEXT: v_add_u32_e32 v6, v7, v6
|
||||
; WITHASYNC-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; WITHASYNC-NEXT: v_add3_u32 v0, v6, v3, v0
|
||||
; WITHASYNC-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; WITHASYNC-NEXT: v_add3_u32 v0, v0, v1, v2
|
||||
; WITHASYNC-NEXT: global_store_dword v[4:5], v0, off
|
||||
; WITHASYNC-NEXT: s_waitcnt vmcnt(0)
|
||||
; WITHASYNC-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; WITHOUT-LABEL: fence_with_asyncmark:
|
||||
; WITHOUT: ; %bb.0: ; %entry
|
||||
; WITHOUT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; WITHOUT-NEXT: s_mov_b32 m0, s20
|
||||
; WITHOUT-NEXT: global_load_dword v6, v[2:3], off
|
||||
; WITHOUT-NEXT: global_load_dword v7, v[0:1], off
|
||||
; WITHOUT-NEXT: v_mov_b32_e32 v8, 0x54
|
||||
; WITHOUT-NEXT: ; wave barrier
|
||||
; WITHOUT-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds
|
||||
; WITHOUT-NEXT: s_waitcnt vmcnt(0)
|
||||
; WITHOUT-NEXT: ; asyncmark
|
||||
; WITHOUT-NEXT: global_load_dword v0, v[0:1], off
|
||||
; WITHOUT-NEXT: v_mov_b32_e32 v1, 0x58
|
||||
; WITHOUT-NEXT: ; wave barrier
|
||||
; WITHOUT-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds
|
||||
; WITHOUT-NEXT: ; wave barrier
|
||||
; WITHOUT-NEXT: global_load_dword v1, v[2:3], off
|
||||
; WITHOUT-NEXT: v_mov_b32_e32 v2, s20
|
||||
; WITHOUT-NEXT: ; asyncmark
|
||||
; WITHOUT-NEXT: ; wait_asyncmark(1)
|
||||
; WITHOUT-NEXT: s_waitcnt vmcnt(1)
|
||||
; WITHOUT-NEXT: ds_read_b32 v3, v2
|
||||
; WITHOUT-NEXT: ; wait_asyncmark(0)
|
||||
; WITHOUT-NEXT: ds_read_b32 v2, v2
|
||||
; WITHOUT-NEXT: v_add_u32_e32 v6, v7, v6
|
||||
; WITHOUT-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; WITHOUT-NEXT: v_add3_u32 v0, v6, v3, v0
|
||||
; WITHOUT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; WITHOUT-NEXT: v_add3_u32 v0, v0, v1, v2
|
||||
; WITHOUT-NEXT: global_store_dword v[4:5], v0, off
|
||||
; WITHOUT-NEXT: s_waitcnt vmcnt(0)
|
||||
; WITHOUT-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
; First batch: global load, global load, async global-to-LDS.
|
||||
%bar_v11 = load i32, ptr addrspace(1) %bar
|
||||
%foo_v1 = load i32, ptr addrspace(1) %foo
|
||||
call void @llvm.amdgcn.wave.barrier()
|
||||
call void @llvm.amdgcn.raw.ptr.buffer.load.ASYNC.lds(ptr addrspace(8) %buf, ptr addrspace(3) %lds, i32 4, i32 84, i32 0, i32 0, i32 0)
|
||||
fence release
|
||||
call void @llvm.amdgcn.asyncmark()
|
||||
|
||||
; Second batch: global load, async global-to-LDS, global load.
|
||||
%foo_v2 = load i32, ptr addrspace(1) %foo
|
||||
call void @llvm.amdgcn.wave.barrier()
|
||||
call void @llvm.amdgcn.raw.ptr.buffer.load.ASYNC.lds(ptr addrspace(8) %buf, ptr addrspace(3) %lds, i32 4, i32 88, i32 0, i32 0, i32 0)
|
||||
call void @llvm.amdgcn.wave.barrier()
|
||||
%bar_v12 = load i32, ptr addrspace(1) %bar
|
||||
call void @llvm.amdgcn.asyncmark()
|
||||
|
||||
; Wait for first async mark and read from LDS.
|
||||
; This results in vmcnt(3) corresponding to the second batch.
|
||||
call void @llvm.amdgcn.wait.asyncmark(i16 1)
|
||||
%lds_val21 = load i32, ptr addrspace(3) %lds
|
||||
|
||||
; Wait for the next lds dma.
|
||||
; This results in vmcnt(1) because the last global load is not async.
|
||||
call void @llvm.amdgcn.wait.asyncmark(i16 0)
|
||||
%lds_val22 = load i32, ptr addrspace(3) %lds
|
||||
%sum1 = add i32 %foo_v1, %bar_v11
|
||||
%sum2 = add i32 %sum1, %lds_val21
|
||||
%sum3 = add i32 %sum2, %foo_v2
|
||||
%sum4 = add i32 %sum3, %bar_v12
|
||||
%sum5 = add i32 %sum4, %lds_val22
|
||||
store i32 %sum5, ptr addrspace(1) %out
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; A perfect loop that is unlikely to exist in real life. It uses only async LDS
|
||||
; DMA operations, and result in vmcnt waits that exactly match the stream of
|
||||
; those outstanding operations.
|
||||
@ -263,7 +366,7 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p
|
||||
; WITHASYNC-NEXT: s_mov_b32 s6, 2
|
||||
; WITHASYNC-NEXT: s_mov_b64 s[4:5], 0
|
||||
; WITHASYNC-NEXT: ; asyncmark
|
||||
; WITHASYNC-NEXT: .LBB3_1: ; %loop_body
|
||||
; WITHASYNC-NEXT: .LBB4_1: ; %loop_body
|
||||
; WITHASYNC-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; WITHASYNC-NEXT: v_readfirstlane_b32 s7, v2
|
||||
; WITHASYNC-NEXT: s_mov_b32 m0, s7
|
||||
@ -278,7 +381,7 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p
|
||||
; WITHASYNC-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; WITHASYNC-NEXT: v_add_u32_e32 v5, v5, v6
|
||||
; WITHASYNC-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; WITHASYNC-NEXT: s_cbranch_execnz .LBB3_1
|
||||
; WITHASYNC-NEXT: s_cbranch_execnz .LBB4_1
|
||||
; WITHASYNC-NEXT: ; %bb.2: ; %epilog
|
||||
; WITHASYNC-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; WITHASYNC-NEXT: ; wait_asyncmark(1)
|
||||
@ -303,7 +406,7 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p
|
||||
; WITHOUT-NEXT: s_mov_b32 s6, 2
|
||||
; WITHOUT-NEXT: s_mov_b64 s[4:5], 0
|
||||
; WITHOUT-NEXT: ; asyncmark
|
||||
; WITHOUT-NEXT: .LBB3_1: ; %loop_body
|
||||
; WITHOUT-NEXT: .LBB4_1: ; %loop_body
|
||||
; WITHOUT-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; WITHOUT-NEXT: v_readfirstlane_b32 s7, v2
|
||||
; WITHOUT-NEXT: s_mov_b32 m0, s7
|
||||
@ -318,7 +421,7 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p
|
||||
; WITHOUT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; WITHOUT-NEXT: v_add_u32_e32 v5, v5, v6
|
||||
; WITHOUT-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; WITHOUT-NEXT: s_cbranch_execnz .LBB3_1
|
||||
; WITHOUT-NEXT: s_cbranch_execnz .LBB4_1
|
||||
; WITHOUT-NEXT: ; %bb.2: ; %epilog
|
||||
; WITHOUT-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; WITHOUT-NEXT: ; wait_asyncmark(1)
|
||||
@ -395,7 +498,7 @@ define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspac
|
||||
; WITHASYNC-NEXT: v_mov_b32_e32 v13, v8
|
||||
; WITHASYNC-NEXT: s_waitcnt vmcnt(1)
|
||||
; WITHASYNC-NEXT: v_mov_b32_e32 v15, v9
|
||||
; WITHASYNC-NEXT: .LBB4_1: ; %loop_body
|
||||
; WITHASYNC-NEXT: .LBB5_1: ; %loop_body
|
||||
; WITHASYNC-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; WITHASYNC-NEXT: v_readfirstlane_b32 s7, v2
|
||||
; WITHASYNC-NEXT: s_waitcnt vmcnt(1)
|
||||
@ -415,7 +518,7 @@ define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspac
|
||||
; WITHASYNC-NEXT: ; asyncmark
|
||||
; WITHASYNC-NEXT: ; wait_asyncmark(2)
|
||||
; WITHASYNC-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; WITHASYNC-NEXT: s_cbranch_execnz .LBB4_1
|
||||
; WITHASYNC-NEXT: s_cbranch_execnz .LBB5_1
|
||||
; WITHASYNC-NEXT: ; %bb.2: ; %epilog
|
||||
; WITHASYNC-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; WITHASYNC-NEXT: ds_read_b32 v0, v2
|
||||
@ -456,7 +559,7 @@ define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspac
|
||||
; WITHOUT-NEXT: v_mov_b32_e32 v13, v8
|
||||
; WITHOUT-NEXT: s_waitcnt vmcnt(1)
|
||||
; WITHOUT-NEXT: v_mov_b32_e32 v15, v9
|
||||
; WITHOUT-NEXT: .LBB4_1: ; %loop_body
|
||||
; WITHOUT-NEXT: .LBB5_1: ; %loop_body
|
||||
; WITHOUT-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; WITHOUT-NEXT: v_readfirstlane_b32 s7, v2
|
||||
; WITHOUT-NEXT: s_waitcnt vmcnt(1)
|
||||
@ -476,7 +579,7 @@ define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspac
|
||||
; WITHOUT-NEXT: ; asyncmark
|
||||
; WITHOUT-NEXT: ; wait_asyncmark(2)
|
||||
; WITHOUT-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; WITHOUT-NEXT: s_cbranch_execnz .LBB4_1
|
||||
; WITHOUT-NEXT: s_cbranch_execnz .LBB5_1
|
||||
; WITHOUT-NEXT: ; %bb.2: ; %epilog
|
||||
; WITHOUT-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; WITHOUT-NEXT: s_waitcnt vmcnt(0)
|
||||
|
||||
25
llvm/test/CodeGen/AMDGPU/asyncmark-waitcnt.mir
Normal file
25
llvm/test/CodeGen/AMDGPU/asyncmark-waitcnt.mir
Normal file
@ -0,0 +1,25 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=si-insert-waitcnts -o - %s | FileCheck %s
|
||||
|
||||
--- |
|
||||
define void @asyncmark_preexisting_waitcnt() { unreachable }
|
||||
...
|
||||
|
||||
---
|
||||
name: asyncmark_preexisting_waitcnt
|
||||
tracksRegLiveness: true
|
||||
machineFunctionInfo:
|
||||
occupancy: 16
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0
|
||||
; CHECK-LABEL: name: asyncmark_preexisting_waitcnt
|
||||
; CHECK: liveins: $vgpr0
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: S_WAITCNT 0
|
||||
; CHECK-NEXT: ASYNCMARK
|
||||
; CHECK-NEXT: S_ENDPGM 0
|
||||
S_WAITCNT_soft 0
|
||||
ASYNCMARK
|
||||
S_ENDPGM 0
|
||||
...
|
||||
Loading…
x
Reference in New Issue
Block a user