The existing "LDS DMA" builtins/intrinsics copy data from global/buffer pointer to LDS. These are now augmented with their ".async" version, where the compiler does not automatically track completion. The completion is now tracked using explicit mark/wait intrinsics, which must be inserted by the user. This makes it possible to write programs with efficient waits in software pipeline loops. The program can now wait for only the oldest outstanding operations to finish, while launching more operations for later use. This change only contains the new names of the builtins/intrinsics, which continue to behave exactly like their non-async counterparts. A later change will implement the actual mark/wait semantics in SIInsertWaitcnts. This is part of a stack split out from #173259: - #180467 - #180466 Fixes: SWDEV-521121
150 lines
6.9 KiB
LLVM
150 lines
6.9 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-SDAG %s
|
|
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-GISEL %s
|
|
|
|
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
|
|
; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
|
|
|
|
; ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.load.to.lds
|
|
|
|
; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.load.to.lds),
|
|
|
|
;; Note: this is a bare-bones test to make sure that amdgcn.load.to.lds lowers to
|
|
;; the correct intrinsic.
|
|
|
|
declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
|
|
declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
|
|
|
|
;---------------------------------------------------------------------
|
|
; dwordx3
|
|
;---------------------------------------------------------------------
|
|
|
|
; Global (addrspace 1) -> LDS copy through llvm.amdgcn.load.to.lds.p1 with
; size=12, offset=16, aux=1. Both selectors are expected to emit a single
; global_load_lds_dwordx3 carrying "offset:16 sc0", after copying the inreg
; LDS pointer from s0 into m0.
;
; Only the per-selector check prefixes enabled by the RUN lines are kept;
; check lines under a prefix that no RUN line passes to FileCheck are never
; matched and only duplicate the live ones.
define amdgpu_ps void @global_load_lds_dwordx3_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx3_vaddr_saddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_mov_b32 m0, s0
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx3_vaddr_saddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_mov_b32 m0, s0
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
; GFX950-GISEL-NEXT: s_endpgm
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
  ret void
}
|
|
|
|
; Buffer fat pointer (addrspace 7) -> LDS copy through the ".async" flavour,
; llvm.amdgcn.load.async.to.lds.p7, with size=12, offset=16, aux=1. The source
; address is %gptr advanced by %off bytes. The expected code is a v_add_u32 of
; the offset, an m0 setup from s5, and one "buffer_load_dwordx3 ... lds" with
; "offen offset:16 sc0".
define amdgpu_ps void @buffer_load_lds_async(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX950-SDAG-LABEL: buffer_load_lds_async:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_add_u32_e32 v0, s4, v0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s5
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: buffer_load_lds_async:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX950-GISEL-NEXT: s_mov_b32 m0, s5
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX950-GISEL-NEXT: s_endpgm
  %src = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
  call void @llvm.amdgcn.load.async.to.lds.p7(ptr addrspace(7) %src, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
  ret void
}
|
|
|
|
; Buffer fat pointer (addrspace 7) -> LDS copy through
; llvm.amdgcn.load.to.lds.p7 with size=12, offset=16, aux=1. The source address
; is %gptr advanced by %off bytes. Both selectors are expected to add the
; offset into v0, copy s5 into m0, and emit one "buffer_load_dwordx3 ... lds"
; with "offen offset:16 sc0".
;
; Only the per-selector check prefixes enabled by the RUN lines are kept;
; check lines under a prefix that no RUN line passes to FileCheck are never
; matched and only duplicate the live ones.
define amdgpu_ps void @buffer_load_lds_dwordx3_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX950-SDAG-LABEL: buffer_load_lds_dwordx3_vaddr_saddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_add_u32_e32 v0, s4, v0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s5
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: buffer_load_lds_dwordx3_vaddr_saddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX950-GISEL-NEXT: s_mov_b32 m0, s5
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX950-GISEL-NEXT: s_endpgm
  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
  ret void
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; dwordx4
|
|
;---------------------------------------------------------------------
|
|
|
|
; Global (addrspace 1) -> LDS copy with size=16, offset=16, aux=1. Both
; selectors are expected to emit a single global_load_lds_dwordx4 carrying
; "offset:16 sc0", after copying the inreg LDS pointer from s0 into m0.
;
; The call uses llvm.amdgcn.load.to.lds.p1, the address-space-overloaded form
; declared at the top of this file, so that the dwordx4 global path of the
; intrinsic this file is about is actually exercised (rather than the separate
; llvm.amdgcn.global.load.lds entry point).
;
; Only the per-selector check prefixes enabled by the RUN lines are kept;
; check lines under a prefix that no RUN line passes to FileCheck are never
; matched and only duplicate the live ones.
define amdgpu_ps void @global_load_lds_dwordx4_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx4_vaddr_saddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_mov_b32 m0, s0
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx4_vaddr_saddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_mov_b32 m0, s0
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
; GFX950-GISEL-NEXT: s_endpgm
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1)
  ret void
}
|
|
|
|
; Buffer fat pointer (addrspace 7) -> LDS copy through
; llvm.amdgcn.load.to.lds.p7 with size=16, offset=16, aux=1. The source address
; is %gptr advanced by %off bytes. Both selectors are expected to add the
; offset into v0, copy s5 into m0, and emit one "buffer_load_dwordx4 ... lds"
; with "offen offset:16 sc0".
;
; Only the per-selector check prefixes enabled by the RUN lines are kept;
; check lines under a prefix that no RUN line passes to FileCheck are never
; matched and only duplicate the live ones.
define amdgpu_ps void @buffer_load_lds_dwordx4_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX950-SDAG-LABEL: buffer_load_lds_dwordx4_vaddr_saddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_add_u32_e32 v0, s4, v0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s5
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: buffer_load_lds_dwordx4_vaddr_saddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX950-GISEL-NEXT: s_mov_b32 m0, s5
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX950-GISEL-NEXT: s_endpgm
  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1)
  ret void
}
|