llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll
Sameer Sahasrabuddhe b02b395a1e
[AMDGPU] Asynchronous loads from global/buffer to LDS on pre-GFX12 (#180466)
The existing "LDS DMA" builtins/intrinsics copy data from global/buffer
pointer to LDS. These are now augmented with their ".async" version,
where the compiler does not automatically track completion. The
completion is now tracked using explicit mark/wait intrinsics, which
must be inserted by the user. This makes it possible to write programs
with efficient waits in software pipeline loops. The program can now
wait for only the oldest outstanding operations to finish, while
launching more operations for later use.

This change only contains the new names of the builtins/intrinsics,
which continue to behave exactly like their non-async counterparts. A
later change will implement the actual mark/wait semantics in
SIInsertWaitcnts.

This is part of a stack split out from #173259:
- #180467
- #180466

Fixes: SWDEV-521121
2026-02-11 05:26:58 +00:00

150 lines
6.9 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-GISEL %s
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
; ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.load.to.lds
; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.load.to.lds),
;; Note: this is a bare-bones test to make sure that amdgcn.load.to.lds lowers to
;; the correct intrinsic.
;; Overloads of the intrinsic under test, one per source address space:
;;   .p1 - source is a ptr addrspace(1)
;;   .p7 - source is a ptr addrspace(7)
;; Both take a ptr addrspace(3) destination followed by three i32 operands
;; (size, offset, aux).
;; NOTE(review): @llvm.amdgcn.load.async.to.lds.p7, called by
;; @buffer_load_lds_async, has no declaration in this file; presumably the IR
;; parser declares intrinsics implicitly - confirm.
declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
;---------------------------------------------------------------------
; dwordx3
;---------------------------------------------------------------------
;; load.to.lds from an addrspace(1) pointer with size 12. Both selectors are
;; expected to emit one global_load_lds_dwordx3; the offset (16) and aux (1)
;; operands show up as "offset:16 sc0" on that instruction.
define amdgpu_ps void @global_load_lds_dwordx3_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx3_vaddr_saddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_mov_b32 m0, s0
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx3_vaddr_saddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_mov_b32 m0, s0
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
; GFX950-GISEL-NEXT: s_endpgm
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
  ret void
}
;; Same operands as @buffer_load_lds_dwordx3_vaddr_saddr, but through the
;; load.async.to.lds.p7 intrinsic. The expected instruction sequence is
;; identical to the non-async buffer dwordx3 test for both selectors.
define amdgpu_ps void @buffer_load_lds_async(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX950-SDAG-LABEL: buffer_load_lds_async:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_add_u32_e32 v0, s4, v0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s5
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: buffer_load_lds_async:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX950-GISEL-NEXT: s_mov_b32 m0, s5
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX950-GISEL-NEXT: s_endpgm
%gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
call void @llvm.amdgcn.load.async.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
ret void
}
;; load.to.lds from an addrspace(7) pointer advanced by %off bytes, size 12.
;; Both selectors are expected to add the offset into v0 and emit one
;; buffer_load_dwordx3 ... lds carrying "offset:16 sc0".
define amdgpu_ps void @buffer_load_lds_dwordx3_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX950-SDAG-LABEL: buffer_load_lds_dwordx3_vaddr_saddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_add_u32_e32 v0, s4, v0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s5
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: buffer_load_lds_dwordx3_vaddr_saddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX950-GISEL-NEXT: s_mov_b32 m0, s5
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX950-GISEL-NEXT: s_endpgm
  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
  ret void
}
;---------------------------------------------------------------------
; dwordx4
;---------------------------------------------------------------------
;; load.to.lds from an addrspace(1) pointer with size 16. Both selectors are
;; expected to emit one global_load_lds_dwordx4 carrying "offset:16 sc0".
;; The call goes through the declared load.to.lds.p1 overload, matching the
;; dwordx3 global test.
define amdgpu_ps void @global_load_lds_dwordx4_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx4_vaddr_saddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_mov_b32 m0, s0
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx4_vaddr_saddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_mov_b32 m0, s0
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
; GFX950-GISEL-NEXT: s_endpgm
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1)
  ret void
}
;; load.to.lds from an addrspace(7) pointer advanced by %off bytes, size 16.
;; Both selectors are expected to add the offset into v0 and emit one
;; buffer_load_dwordx4 ... lds carrying "offset:16 sc0".
define amdgpu_ps void @buffer_load_lds_dwordx4_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX950-SDAG-LABEL: buffer_load_lds_dwordx4_vaddr_saddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_add_u32_e32 v0, s4, v0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s5
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: buffer_load_lds_dwordx4_vaddr_saddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX950-GISEL-NEXT: s_mov_b32 m0, s5
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX950-GISEL-NEXT: s_endpgm
  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1)
  ret void
}