llvm-project/llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll
Sameer Sahasrabuddhe 128437fb6a
[AMDGPU] Introduce asyncmark/wait intrinsics (#180467)
Asynchronous operations are memory transfers (usually between the global
memory and LDS) that are completed independently at an unspecified
scope. A thread that requests one or more asynchronous transfers can use
async marks to track their completion. The thread waits for each mark to
be completed, which indicates that requests initiated in program order
before this mark have also completed.

For now, we implement asyncmark/wait operations on pre-GFX12
architectures that support "LDS DMA" operations. Future work will extend
support to GFX12Plus architectures that support "true" async operations.

This is part of a stack split out from #173259
- #180467
- #180466

Co-authored-by: Ryan Mitchell <ryan.mitchell@amd.com>

Fixes: SWDEV-521121
2026-02-11 07:15:51 +00:00

114 lines
5.6 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
; Test: raw buffer loads into LDS (vector <4 x i32> resource form) combined
; with asyncmark / wait.asyncmark.
;
; The body issues three llvm.amdgcn.raw.buffer.load.async.lds calls on the same
; resource and LDS pointer, placing an asyncmark after the first and after the
; second, then waits with wait.asyncmark(i16 1) before reading %lds back with an
; ordinary load.
;
; The autogenerated assertions below pin the expected gfx900 output: three
; "buffer_load_dword ... lds" instructions (the second with offset:4 glc, the
; third with offset:8 slc), the marks printed as "; asyncmark" comments, and the
; wait printed as "; wait_asyncmark(1)" immediately followed by
; "s_waitcnt vmcnt(2)" ahead of the ds_read_b32.
define float @raw.buffer.load(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
; CHECK-LABEL: raw.buffer.load:
; CHECK: ; %bb.0: ; %main_body
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 m0, s20
; CHECK-NEXT: v_mov_b32_e32 v0, s20
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:4 glc lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:8 slc lds
; CHECK-NEXT: ; wait_asyncmark(1)
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
main_body:
; The last two operands differ per call: (0, 0), (4, 1), (8, 2). In the expected
; output these show up as no modifier, "offset:4 glc" and "offset:8 slc".
; NOTE(review): presumably these are the immediate offset and the aux/cache-policy
; bits, and the leading i32 4 is the transfer size in bytes - confirm against the
; intrinsic definition.
call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.asyncmark()
call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
call void @llvm.amdgcn.asyncmark()
call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
; No mark follows the third load. The wait below is expected to become
; s_waitcnt vmcnt(2).
; NOTE(review): presumably i16 1 is the number of marks allowed to remain
; incomplete, so only the first load has to finish - confirm against the
; asyncmark specification.
call void @llvm.amdgcn.wait.asyncmark(i16 1)
; Plain LDS read of the destination; its result is the function's return value.
%res = load float, ptr addrspace(3) %lds
ret float %res
}
; Test: raw buffer loads into LDS (ptr addrspace(8) resource form) combined with
; asyncmark / wait.asyncmark.
;
; The body issues three llvm.amdgcn.raw.ptr.buffer.load.async.lds calls on the
; same resource and LDS pointer, placing an asyncmark after the first and after
; the second, then waits with wait.asyncmark(i16 1) before reading %lds back
; with an ordinary load.
;
; The autogenerated assertions below pin the expected gfx900 output: three
; "buffer_load_dword ... lds" instructions (the second with offset:4 glc, the
; third with offset:8 slc), the marks printed as "; asyncmark" comments, and the
; wait printed as "; wait_asyncmark(1)" immediately followed by
; "s_waitcnt vmcnt(2)" ahead of the ds_read_b32.
define float @raw.ptr.buffer.load(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
; CHECK-LABEL: raw.ptr.buffer.load:
; CHECK: ; %bb.0: ; %main_body
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 m0, s20
; CHECK-NEXT: v_mov_b32_e32 v0, s20
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:4 glc lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:8 slc lds
; CHECK-NEXT: ; wait_asyncmark(1)
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
main_body:
; The last two operands differ per call: (0, 0), (4, 1), (8, 2). In the expected
; output these show up as no modifier, "offset:4 glc" and "offset:8 slc".
; NOTE(review): presumably these are the immediate offset and the aux/cache-policy
; bits, and the leading i32 4 is the transfer size in bytes - confirm against the
; intrinsic definition.
call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.asyncmark()
call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
call void @llvm.amdgcn.asyncmark()
call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
; No mark follows the third load. The wait below is expected to become
; s_waitcnt vmcnt(2).
; NOTE(review): presumably i16 1 is the number of marks allowed to remain
; incomplete, so only the first load has to finish - confirm against the
; asyncmark specification.
call void @llvm.amdgcn.wait.asyncmark(i16 1)
; Plain LDS read of the destination; its result is the function's return value.
%res = load float, ptr addrspace(3) %lds
ret float %res
}
; Test: struct buffer loads into LDS (vector <4 x i32> resource form) combined
; with asyncmark / wait.asyncmark.
;
; Same shape as the raw variants: three llvm.amdgcn.struct.buffer.load.async.lds
; calls with an asyncmark after the first and second, then
; wait.asyncmark(i16 1) and an ordinary load from %lds. The struct form carries
; one extra i32 operand (constant 8 here) compared with the raw form.
;
; The autogenerated assertions below pin the expected gfx900 output: the
; constant 8 is materialized into v0 and used by three
; "buffer_load_dword v0, ... idxen ... lds" instructions (the second with
; offset:4 glc, the third with offset:8 slc). v0 is then overwritten with s20
; for the ds_read_b32, and the wait appears as "; wait_asyncmark(1)" followed
; by "s_waitcnt vmcnt(2)".
define float @struct.buffer.load(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
; CHECK-LABEL: struct.buffer.load:
; CHECK: ; %bb.0: ; %main_body
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 m0, s20
; CHECK-NEXT: v_mov_b32_e32 v0, 8
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:4 glc lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:8 slc lds
; CHECK-NEXT: v_mov_b32_e32 v0, s20
; CHECK-NEXT: ; wait_asyncmark(1)
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
main_body:
; The i32 8 operand is the same in all three calls; the expected output feeds it
; through v0 with the idxen modifier.
; NOTE(review): presumably this is the struct index (vindex) - confirm against
; the intrinsic definition.
; The last two operands differ per call: (0, 0), (4, 1), (8, 2), matching no
; modifier, "offset:4 glc" and "offset:8 slc" in the expected output.
call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.asyncmark()
call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
call void @llvm.amdgcn.asyncmark()
call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
; No mark follows the third load. The wait below is expected to become
; s_waitcnt vmcnt(2).
; NOTE(review): presumably i16 1 is the number of marks allowed to remain
; incomplete, so only the first load has to finish - confirm against the
; asyncmark specification.
call void @llvm.amdgcn.wait.asyncmark(i16 1)
; Plain LDS read of the destination; its result is the function's return value.
%res = load float, ptr addrspace(3) %lds
ret float %res
}
; Test: struct buffer loads into LDS (ptr addrspace(8) resource form) combined
; with asyncmark / wait.asyncmark.
;
; Same shape as the raw variants: three
; llvm.amdgcn.struct.ptr.buffer.load.async.lds calls with an asyncmark after the
; first and second, then wait.asyncmark(i16 1) and an ordinary load from %lds.
; The struct form carries one extra i32 operand (constant 8 here) compared with
; the raw form.
;
; The autogenerated assertions below pin the expected gfx900 output: the
; constant 8 is materialized into v0 and used by three
; "buffer_load_dword v0, ... idxen ... lds" instructions (the second with
; offset:4 glc, the third with offset:8 slc). v0 is then overwritten with s20
; for the ds_read_b32, and the wait appears as "; wait_asyncmark(1)" followed
; by "s_waitcnt vmcnt(2)".
define float @struct.ptr.buffer.load(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
; CHECK-LABEL: struct.ptr.buffer.load:
; CHECK: ; %bb.0: ; %main_body
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 m0, s20
; CHECK-NEXT: v_mov_b32_e32 v0, 8
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:4 glc lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:8 slc lds
; CHECK-NEXT: v_mov_b32_e32 v0, s20
; CHECK-NEXT: ; wait_asyncmark(1)
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
main_body:
; The i32 8 operand is the same in all three calls; the expected output feeds it
; through v0 with the idxen modifier.
; NOTE(review): presumably this is the struct index (vindex) - confirm against
; the intrinsic definition.
; The last two operands differ per call: (0, 0), (4, 1), (8, 2), matching no
; modifier, "offset:4 glc" and "offset:8 slc" in the expected output.
call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.asyncmark()
call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
call void @llvm.amdgcn.asyncmark()
call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
; No mark follows the third load. The wait below is expected to become
; s_waitcnt vmcnt(2).
; NOTE(review): presumably i16 1 is the number of marks allowed to remain
; incomplete, so only the first load has to finish - confirm against the
; asyncmark specification.
call void @llvm.amdgcn.wait.asyncmark(i16 1)
; Plain LDS read of the destination; its result is the function's return value.
%res = load float, ptr addrspace(3) %lds
ret float %res
}