Dark Steve 7d381f2a56
[AMDGPU] Schedule independent instructions between s_barrier_signal and s_barrier_wait (#172057)
On gfx12+, the unified `s_barrier` is lowered to split
`s_barrier_signal/s_barrier_wait` pairs. By default, the dependency edge
between signal and wait has zero latency, causing the scheduler to emit
them adjacent to each other. This misses the opportunity to hide barrier
latency.

This patch adds synthetic latency to the signal-wait barrier edge to
encourage latency hiding. Independent instructions are scheduled in the
gap between split barrier signal and wait.

The latency is tunable via -amdgpu-barrier-signal-wait-latency.

Fixes: SWDEV-567090
2025-12-16 11:48:50 +05:30

294 lines
13 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
; Named-barrier objects placed in addrspace(3). The functions below pass these
; to the join / init / signal.var / get.named.barrier.state intrinsics.
; In the expected output the barriers are referred to by small integer ids:
; @bar appears as 2, @bar2 as 1 and @bar3 as 3.
@bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
; func1: non-kernel function that joins the named barrier @bar3, signals it via
; signal.var with i32 operand 7, and then waits with operand 1.
;
; Expected output: the signal uses m0 = 0x70003, i.e. the operand 7 in the
; bits above bit 16 and the id 3 used for @bar3 in the low bits. The
; SelectionDAG run passes the join id through m0, while the GlobalISel run
; encodes it as an immediate (s_barrier_join 3). In both runs s_barrier_signal
; and s_barrier_wait are adjacent; there is nothing else in the function to
; place between them.
define void @func1() {
; GFX12-SDAG-LABEL: func1:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_mov_b32 m0, 3
; GFX12-SDAG-NEXT: s_barrier_join m0
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70003
; GFX12-SDAG-NEXT: s_barrier_signal m0
; GFX12-SDAG-NEXT: s_barrier_wait 1
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: func1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70003
; GFX12-GISEL-NEXT: s_barrier_join 3
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
; func2: same join / signal.var / wait sequence as func1, but operating on the
; named barrier @bar2.
;
; Expected output: identical in shape to func1 except that the barrier id is 1,
; so the signal uses m0 = 0x70001 and the join uses 1 (through m0 in the
; SelectionDAG run, as an immediate in the GlobalISel run).
define void @func2() {
; GFX12-SDAG-LABEL: func2:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_mov_b32 m0, 1
; GFX12-SDAG-NEXT: s_barrier_join m0
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001
; GFX12-SDAG-NEXT: s_barrier_signal m0
; GFX12-SDAG-NEXT: s_barrier_wait 1
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: func2:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70001
; GFX12-GISEL-NEXT: s_barrier_join 1
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
; kernel1: exercises the barrier intrinsics together in a single kernel:
;   - init and signal.var on the static named barrier @bar (operand 12) and on
;     the dynamic pointer argument %in (operand 9),
;   - signal and signal.isfirst on barrier -1, join on %in,
;   - wait and leave with operand 1,
;   - get.named.barrier.state on @bar and %in,
;   - the unified s.barrier,
;   - calls to func1 and func2, and a final get.barrier.state(-1).
;
; Expected output, static barrier: @bar with operand 12 becomes m0 = 0xc0002.
; Expected output, dynamic barrier: a value loaded from kernarg offset 0x2c
; (presumably %in) is turned into an id with a shift right by 4 and a mask with
; 63, and 0x90000 is ORed in for the init / signal.var with operand 9.
;
; The unified s.barrier call is expected as a split s_barrier_signal -1 /
; s_barrier_wait -1 pair, with the s_load_b64 that fetches func1's GOT entry
; scheduled between the two instructions.
define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
; GFX12-SDAG-LABEL: kernel1:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002
; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0
; GFX12-SDAG-NEXT: s_barrier_init m0
; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48
; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_lshr_b32 s2, s2, 4
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_and_b32 s2, s2, 63
; GFX12-SDAG-NEXT: s_or_b32 s3, 0x90000, s2
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
; GFX12-SDAG-NEXT: s_mov_b32 m0, s3
; GFX12-SDAG-NEXT: s_barrier_init m0
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002
; GFX12-SDAG-NEXT: s_barrier_signal m0
; GFX12-SDAG-NEXT: s_mov_b32 m0, s3
; GFX12-SDAG-NEXT: s_barrier_signal m0
; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
; GFX12-SDAG-NEXT: s_barrier_signal -1
; GFX12-SDAG-NEXT: s_barrier_join m0
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
; GFX12-SDAG-NEXT: s_barrier_wait 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_barrier_leave
; GFX12-SDAG-NEXT: s_get_barrier_state s3, m0
; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
; GFX12-SDAG-NEXT: s_get_barrier_state s2, m0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3]
; GFX12-SDAG-NEXT: s_sext_i32_i16 s3, s3
; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func1@gotpcrel32@lo+8
; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func1@gotpcrel32@hi+16
; GFX12-SDAG-NEXT: s_barrier_signal -1
; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-SDAG-NEXT: s_barrier_wait -1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3]
; GFX12-SDAG-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-SDAG-NEXT: s_sext_i32_i16 s3, s3
; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func2@gotpcrel32@lo+12
; GFX12-SDAG-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func2@gotpcrel32@hi+24
; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX12-SDAG-NEXT: s_get_barrier_state s0, -1
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: kernel1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_mov_b64 s[12:13], s[4:5]
; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-GISEL-NEXT: s_load_b32 s0, s[12:13], 0x2c
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0002
; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0
; GFX12-GISEL-NEXT: s_barrier_init m0
; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshr_b32 s0, s0, 4
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_and_b32 s0, s0, 63
; GFX12-GISEL-NEXT: s_or_b32 s1, s0, 0x90000
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
; GFX12-GISEL-NEXT: s_barrier_init m0
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0002
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_mov_b32 m0, s0
; GFX12-GISEL-NEXT: s_barrier_signal -1
; GFX12-GISEL-NEXT: s_barrier_join m0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_barrier_leave
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
; GFX12-GISEL-NEXT: s_get_barrier_state s0, 2
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1]
; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func1@gotpcrel32@lo+8
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+16
; GFX12-GISEL-NEXT: s_barrier_signal -1
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX12-GISEL-NEXT: s_barrier_wait -1
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0
; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1]
; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func2@gotpcrel32@lo+12
; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX12-GISEL-NEXT: s_get_barrier_state s0, -1
; GFX12-GISEL-NEXT: s_endpgm
; Set up and signal both the static (@bar) and the dynamic (%in) named barrier.
call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @bar, i32 12)
call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %in, i32 9)
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
call void @llvm.amdgcn.s.barrier.signal(i32 -1)
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
; The results %isfirst, %state, %state2 and %state3 have no uses in this
; function; the corresponding instructions still appear in the expected output.
%isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @llvm.amdgcn.s.barrier.leave(i16 1)
%state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
%state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in)
; Unified barrier: expected as a separate signal and wait on barrier -1.
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
%state3 = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
ret void
}
; kernel2: signals the named barrier @bar via signal.var with operand 7, joins
; it, waits with operand 1, and then calls func2.
;
; Expected output: the signal uses m0 = 0x70002 (operand 7 above bit 16, id 2
; for @bar in the low bits). The address of func2 is loaded from the GOT
; before the barrier instructions. In the SelectionDAG run several register
; moves and the s_barrier_join sit between s_barrier_signal and
; s_barrier_wait; in the GlobalISel run signal, join and wait are emitted back
; to back after the register setup.
define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
; GFX12-SDAG-LABEL: kernel2:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX12-SDAG-NEXT: s_getpc_b64 s[6:7]
; GFX12-SDAG-NEXT: s_sext_i32_i16 s7, s7
; GFX12-SDAG-NEXT: s_add_co_u32 s6, s6, func2@gotpcrel32@lo+8
; GFX12-SDAG-NEXT: s_add_co_ci_u32 s7, s7, func2@gotpcrel32@hi+16
; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0
; GFX12-SDAG-NEXT: s_load_b64 s[12:13], s[6:7], 0x0
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70002
; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48
; GFX12-SDAG-NEXT: s_barrier_signal m0
; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
; GFX12-SDAG-NEXT: s_barrier_join m0
; GFX12-SDAG-NEXT: s_barrier_wait 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13]
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: kernel2:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s4, 48
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s5, 0
; GFX12-GISEL-NEXT: s_getpc_b64 s[4:5]
; GFX12-GISEL-NEXT: s_sext_i32_i16 s5, s5
; GFX12-GISEL-NEXT: s_add_co_u32 s4, s4, func2@gotpcrel32@lo+8
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+16
; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0
; GFX12-GISEL-NEXT: s_load_b64 s[12:13], s[4:5], 0x0
; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70002
; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_join 2
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13]
; GFX12-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
ret void
}
; test_barrier_leave_write_to_scc: calls s.barrier.leave and afterwards
; computes select(%val != 0, 123, 456) and stores it to %out.
;
; Expected output (shared by both instruction selectors): s_barrier_leave comes
; first, and the s_cmp_lg_u32 feeding s_cselect_b32 is emitted after it.
; NOTE(review): judging by the function name, this guards against the compare
; result in SCC being produced before s_barrier_leave and then overwritten by
; it - confirm against the instruction definition.
define amdgpu_ps void @test_barrier_leave_write_to_scc(i32 inreg %val, ptr addrspace(1) %out) {
; GFX12-LABEL: test_barrier_leave_write_to_scc:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_barrier_leave
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GFX12-NEXT: s_movk_i32 s0, 0x7b
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cselect_b32 s0, s0, 0x1c8
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.s.barrier.leave(i16 1)
%cmp = icmp ne i32 %val, 0
%ret = select i1 %cmp, i32 123, i32 456
store i32 %ret, ptr addrspace(1) %out
ret void
}
; Declarations of the AMDGPU barrier intrinsics called by the functions above.
; Every declaration carries attribute group #1 (convergent nounwind).
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.s.barrier.wait(i16) #1
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
declare void @llvm.amdgcn.s.barrier.leave(i16) #1
declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
; Attribute groups: #0 is attached to the two amdgpu_kernel definitions, #1 to
; the intrinsic declarations.
; NOTE(review): #2 does not appear to be referenced by any definition or
; declaration in this file - confirm before removing.
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }