On gfx12+, the unified `s_barrier` is lowered to split `s_barrier_signal/s_barrier_wait` pairs. By default, the dependency edge between signal and wait has zero latency, causing the scheduler to emit them adjacent to each other. This misses the opportunity to hide barrier latency. This patch adds synthetic latency to the signal-wait barrier edge to encourage latency hiding. Independent instructions are scheduled in the gap between split barrier signal and wait. The latency is tunable via -amdgpu-barrier-signal-wait-latency. Fixes: SWDEV-567090
294 lines
13 KiB
LLVM
294 lines
13 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
|
|
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
|
|
|
|
; Three internal globals in address space 3, each of the target extension type
; "amdgcn.named.barrier" and initialised to poison. In the visible tests @bar is
; used by kernel1 and kernel2, @bar2 by func2 and @bar3 by func1.
@bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
|
|
@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
|
|
@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
|
|
|
|
; Non-kernel function that joins the named barrier @bar3, signals it through
; s.barrier.signal.var with the i32 operand 7, and then waits with immediate 1.
;
; The expected m0 value for the signal is 0x70003, i.e. (7 << 16) | 3. The low
; value 3 is presumably the barrier ID the backend assigned to @bar3 -- TODO
; confirm against the named-barrier lowering.
; The SelectionDAG output passes the join operand through m0, while the
; GlobalISel output uses the inline immediate 3. In both outputs
; s_barrier_signal and s_barrier_wait are expected back to back.
define void @func1() {
|
|
; GFX12-SDAG-LABEL: func1:
|
|
; GFX12-SDAG: ; %bb.0:
|
|
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, 3
|
|
; GFX12-SDAG-NEXT: s_barrier_join m0
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70003
|
|
; GFX12-SDAG-NEXT: s_barrier_signal m0
|
|
; GFX12-SDAG-NEXT: s_barrier_wait 1
|
|
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-GISEL-LABEL: func1:
|
|
; GFX12-GISEL: ; %bb.0:
|
|
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70003
|
|
; GFX12-GISEL-NEXT: s_barrier_join 3
|
|
; GFX12-GISEL-NEXT: s_barrier_signal m0
|
|
; GFX12-GISEL-NEXT: s_barrier_wait 1
|
|
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
; Join, then signal, then wait -- all on the statically known barrier @bar3.
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
|
|
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
|
|
call void @llvm.amdgcn.s.barrier.wait(i16 1)
|
|
ret void
|
|
}
|
|
|
|
; Same shape as func1 but operating on @bar2: join, signal.var with the i32
; operand 7, then wait with immediate 1.
;
; The expected m0 value for the signal is 0x70001, i.e. (7 << 16) | 1. The low
; value 1 is presumably the barrier ID the backend assigned to @bar2 -- TODO
; confirm against the named-barrier lowering.
; The SelectionDAG output passes the join operand through m0, while the
; GlobalISel output uses the inline immediate 1. In both outputs
; s_barrier_signal and s_barrier_wait are expected back to back.
define void @func2() {
|
|
; GFX12-SDAG-LABEL: func2:
|
|
; GFX12-SDAG: ; %bb.0:
|
|
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, 1
|
|
; GFX12-SDAG-NEXT: s_barrier_join m0
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001
|
|
; GFX12-SDAG-NEXT: s_barrier_signal m0
|
|
; GFX12-SDAG-NEXT: s_barrier_wait 1
|
|
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-GISEL-LABEL: func2:
|
|
; GFX12-GISEL: ; %bb.0:
|
|
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70001
|
|
; GFX12-GISEL-NEXT: s_barrier_join 1
|
|
; GFX12-GISEL-NEXT: s_barrier_signal m0
|
|
; GFX12-GISEL-NEXT: s_barrier_wait 1
|
|
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
; Join, then signal, then wait -- all on the statically known barrier @bar2.
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
|
|
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
|
|
call void @llvm.amdgcn.s.barrier.wait(i16 1)
|
|
ret void
|
|
}
|
|
|
|
; Kernel that exercises the barrier intrinsics in one straight-line block:
; init and signal.var on both the global @bar (value 12) and the pointer
; argument %in (value 9), s.barrier.signal(-1), join on %in,
; signal.isfirst(-1), wait(1), leave(1), get.named.barrier.state on @bar and
; on %in, a plain s.barrier, calls to @func1 and @func2, and finally
; get.barrier.state(-1). The results %isfirst, %state, %state2 and %state3 are
; never used; %out is not used either.
;
; For the plain s.barrier call both selectors expect s_barrier_signal -1 and
; s_barrier_wait -1 with one instruction between them: the s_load_b64 that
; fetches the address consumed by the following s_swappc_b64.
define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
|
|
; GFX12-SDAG-LABEL: kernel1:
|
|
; GFX12-SDAG: ; %bb.0:
|
|
; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
|
|
; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3]
|
|
; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002
|
|
; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0
|
|
; GFX12-SDAG-NEXT: s_barrier_init m0
|
|
; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48
|
|
; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
|
|
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
|
|
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-SDAG-NEXT: s_lshr_b32 s2, s2, 4
|
|
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX12-SDAG-NEXT: s_and_b32 s2, s2, 63
|
|
; GFX12-SDAG-NEXT: s_or_b32 s3, 0x90000, s2
|
|
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, s3
|
|
; GFX12-SDAG-NEXT: s_barrier_init m0
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002
|
|
; GFX12-SDAG-NEXT: s_barrier_signal m0
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, s3
|
|
; GFX12-SDAG-NEXT: s_barrier_signal m0
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
|
|
; GFX12-SDAG-NEXT: s_barrier_signal -1
|
|
; GFX12-SDAG-NEXT: s_barrier_join m0
|
|
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
|
|
; GFX12-SDAG-NEXT: s_barrier_wait 1
|
|
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-SDAG-NEXT: s_barrier_leave
|
|
; GFX12-SDAG-NEXT: s_get_barrier_state s3, m0
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
|
|
; GFX12-SDAG-NEXT: s_get_barrier_state s2, m0
|
|
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3]
|
|
; GFX12-SDAG-NEXT: s_sext_i32_i16 s3, s3
|
|
; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func1@gotpcrel32@lo+8
|
|
; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func1@gotpcrel32@hi+16
|
|
; GFX12-SDAG-NEXT: s_barrier_signal -1
|
|
; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
|
|
; GFX12-SDAG-NEXT: s_barrier_wait -1
|
|
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3]
|
|
; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3]
|
|
; GFX12-SDAG-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-SDAG-NEXT: s_sext_i32_i16 s3, s3
|
|
; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func2@gotpcrel32@lo+12
|
|
; GFX12-SDAG-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func2@gotpcrel32@hi+24
|
|
; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
|
|
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3]
|
|
; GFX12-SDAG-NEXT: s_get_barrier_state s0, -1
|
|
; GFX12-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX12-GISEL-LABEL: kernel1:
|
|
; GFX12-GISEL: ; %bb.0:
|
|
; GFX12-GISEL-NEXT: s_mov_b64 s[12:13], s[4:5]
|
|
; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
|
|
; GFX12-GISEL-NEXT: s_load_b32 s0, s[12:13], 0x2c
|
|
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0002
|
|
; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0
|
|
; GFX12-GISEL-NEXT: s_barrier_init m0
|
|
; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
|
|
; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
|
|
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
|
|
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_lshr_b32 s0, s0, 4
|
|
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX12-GISEL-NEXT: s_and_b32 s0, s0, 63
|
|
; GFX12-GISEL-NEXT: s_or_b32 s1, s0, 0x90000
|
|
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
|
|
; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
|
|
; GFX12-GISEL-NEXT: s_barrier_init m0
|
|
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0002
|
|
; GFX12-GISEL-NEXT: s_barrier_signal m0
|
|
; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
|
|
; GFX12-GISEL-NEXT: s_barrier_signal m0
|
|
; GFX12-GISEL-NEXT: s_mov_b32 m0, s0
|
|
; GFX12-GISEL-NEXT: s_barrier_signal -1
|
|
; GFX12-GISEL-NEXT: s_barrier_join m0
|
|
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
|
|
; GFX12-GISEL-NEXT: s_barrier_wait 1
|
|
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_barrier_leave
|
|
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
|
|
; GFX12-GISEL-NEXT: s_get_barrier_state s0, 2
|
|
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0
|
|
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0
|
|
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1]
|
|
; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1
|
|
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func1@gotpcrel32@lo+8
|
|
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+16
|
|
; GFX12-GISEL-NEXT: s_barrier_signal -1
|
|
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
|
|
; GFX12-GISEL-NEXT: s_barrier_wait -1
|
|
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
|
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
|
|
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0
|
|
; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1]
|
|
; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1
|
|
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func2@gotpcrel32@lo+12
|
|
; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24
|
|
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
|
|
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
|
; GFX12-GISEL-NEXT: s_get_barrier_state s0, -1
|
|
; GFX12-GISEL-NEXT: s_endpgm
|
|
; Statically known barrier @bar with value 12: the expected m0 operand is
; 0xc0002, i.e. (12 << 16) | 2.
call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @bar, i32 12)
|
|
; Barrier given by the pointer argument %in with value 9: the expected code
; shifts a loaded 32-bit value right by 4, masks it with 63 and ORs in
; 0x90000 (9 << 16). The loaded value is presumably the %in kernel argument --
; TODO confirm the 0x2c offset.
call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %in, i32 9)
|
|
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
|
|
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
|
|
call void @llvm.amdgcn.s.barrier.signal(i32 -1)
|
|
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
|
|
; The i1 result is intentionally left unused.
%isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
|
|
call void @llvm.amdgcn.s.barrier.wait(i16 1)
|
|
call void @llvm.amdgcn.s.barrier.leave(i16 1)
|
|
; State queries on the static and the dynamic barrier; results unused.
%state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
|
|
%state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in)
|
|
; Unified barrier: expected as the split s_barrier_signal -1 /
; s_barrier_wait -1 pair, with an s_load_b64 placed between the two.
call void @llvm.amdgcn.s.barrier()
|
|
call void @func1()
|
|
call void @func2()
|
|
%state3 = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
|
|
ret void
|
|
}
|
|
|
|
; Kernel that signals the named barrier @bar with the i32 operand 7, joins
; @bar, waits with immediate 1 and then calls @func2. Neither %out nor %in is
; used by the body.
;
; The expected m0 operand for the signal is 0x70002, i.e. (7 << 16) | 2.
; In the SelectionDAG output the join operand goes through m0 and several
; moves sit between s_barrier_signal and s_barrier_join; the GlobalISel output
; has s_barrier_signal, s_barrier_join 2 and s_barrier_wait 1 back to back.
; In both outputs the s_load_b64 of the callee address is issued before the
; barrier instructions and waited on (s_wait_kmcnt) after s_barrier_wait.
define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
|
|
; GFX12-SDAG-LABEL: kernel2:
|
|
; GFX12-SDAG: ; %bb.0:
|
|
; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
|
|
; GFX12-SDAG-NEXT: s_getpc_b64 s[6:7]
|
|
; GFX12-SDAG-NEXT: s_sext_i32_i16 s7, s7
|
|
; GFX12-SDAG-NEXT: s_add_co_u32 s6, s6, func2@gotpcrel32@lo+8
|
|
; GFX12-SDAG-NEXT: s_add_co_ci_u32 s7, s7, func2@gotpcrel32@hi+16
|
|
; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0
|
|
; GFX12-SDAG-NEXT: s_load_b64 s[12:13], s[6:7], 0x0
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70002
|
|
; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48
|
|
; GFX12-SDAG-NEXT: s_barrier_signal m0
|
|
; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
|
|
; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
|
|
; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3]
|
|
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
|
|
; GFX12-SDAG-NEXT: s_barrier_join m0
|
|
; GFX12-SDAG-NEXT: s_barrier_wait 1
|
|
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13]
|
|
; GFX12-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX12-GISEL-LABEL: kernel2:
|
|
; GFX12-GISEL: ; %bb.0:
|
|
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s4, 48
|
|
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s5, 0
|
|
; GFX12-GISEL-NEXT: s_getpc_b64 s[4:5]
|
|
; GFX12-GISEL-NEXT: s_sext_i32_i16 s5, s5
|
|
; GFX12-GISEL-NEXT: s_add_co_u32 s4, s4, func2@gotpcrel32@lo+8
|
|
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+16
|
|
; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0
|
|
; GFX12-GISEL-NEXT: s_load_b64 s[12:13], s[4:5], 0x0
|
|
; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
|
|
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70002
|
|
; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
|
|
; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
|
|
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
|
|
; GFX12-GISEL-NEXT: s_barrier_signal m0
|
|
; GFX12-GISEL-NEXT: s_barrier_join 2
|
|
; GFX12-GISEL-NEXT: s_barrier_wait 1
|
|
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13]
|
|
; GFX12-GISEL-NEXT: s_endpgm
|
|
; Signal first, then join, then wait -- the reverse join/signal order of
; func1 and func2.
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
|
|
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
|
|
call void @llvm.amdgcn.s.barrier.wait(i16 1)
|
|
|
|
call void @func2()
|
|
ret void
|
|
}
|
|
|
|
; Shader that executes s.barrier.leave(1) and then stores
; select(%val != 0, 123, 456) to %out. Both instruction selectors share a
; single set of expected-output lines here.
;
; The expected output places s_cmp_lg_u32 after s_barrier_leave, and the
; s_cselect_b32 consumes that compare (0x7b is 123, 0x1c8 is 456). Judging by
; the test name, s_barrier_leave writes SCC, so the compare presumably must
; not be scheduled ahead of it -- TODO confirm against the ISA documentation.
define amdgpu_ps void @test_barrier_leave_write_to_scc(i32 inreg %val, ptr addrspace(1) %out) {
|
|
; GFX12-LABEL: test_barrier_leave_write_to_scc:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_barrier_leave
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_cmp_lg_u32 s0, 0
|
|
; GFX12-NEXT: s_movk_i32 s0, 0x7b
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_cselect_b32 s0, s0, 0x1c8
|
|
; GFX12-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
|
|
; GFX12-NEXT: s_endpgm
|
|
call void @llvm.amdgcn.s.barrier.leave(i16 1)
|
|
; Scalar compare on the inreg argument, followed by a select on its result.
%cmp = icmp ne i32 %val, 0
|
|
%ret = select i1 %cmp, i32 123, i32 456
|
|
store i32 %ret, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Declarations of the barrier intrinsics called by the functions in this
; file. Every declaration carries attribute group #1.
declare void @llvm.amdgcn.s.barrier() #1
|
|
declare void @llvm.amdgcn.s.barrier.wait(i16) #1
|
|
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
|
|
declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
|
|
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
|
|
declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
|
|
declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
|
|
declare void @llvm.amdgcn.s.barrier.leave(i16) #1
|
|
declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
|
|
declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
|
|
|
|
; #0 is attached to kernel1 and kernel2.
attributes #0 = { nounwind }
|
|
; #1 is attached to all intrinsic declarations above.
attributes #1 = { convergent nounwind }
|
|
; #2 is not referenced by any definition or declaration visible in this file.
attributes #2 = { nounwind readnone }
|