Asynchronous operations are memory transfers (usually between global memory and LDS) that complete independently, at an unspecified scope. A thread that requests one or more asynchronous transfers can use async marks to track their completion: when the thread waits for a mark, completion of that mark indicates that all requests initiated in program order before the mark have also completed. For now, we implement the asyncmark/wait operations on pre-GFX12 architectures that support "LDS DMA" operations. Future work will extend support to GFX12Plus architectures that support "true" async operations. This is part of a stack split out from #173259: #180467, #180466. Co-authored-by: Ryan Mitchell <ryan.mitchell@amd.com>. Fixes: SWDEV-521121.
114 lines
5.6 KiB
LLVM
114 lines
5.6 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
|
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
|
|
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
|
|
|
|
define float @raw.buffer.load(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
|
|
; CHECK-LABEL: raw.buffer.load:
|
|
; CHECK: ; %bb.0: ; %main_body
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: s_mov_b32 m0, s20
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, s20
|
|
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 lds
|
|
; CHECK-NEXT: ; asyncmark
|
|
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:4 glc lds
|
|
; CHECK-NEXT: ; asyncmark
|
|
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:8 slc lds
|
|
; CHECK-NEXT: ; wait_asyncmark(1)
|
|
; CHECK-NEXT: s_waitcnt vmcnt(2)
|
|
; CHECK-NEXT: ds_read_b32 v0, v0
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
main_body:
|
|
call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
|
|
call void @llvm.amdgcn.asyncmark()
|
|
call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
|
|
call void @llvm.amdgcn.asyncmark()
|
|
call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
|
|
call void @llvm.amdgcn.wait.asyncmark(i16 1)
|
|
%res = load float, ptr addrspace(3) %lds
|
|
ret float %res
|
|
}
|
|
|
|
define float @raw.ptr.buffer.load(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
|
|
; CHECK-LABEL: raw.ptr.buffer.load:
|
|
; CHECK: ; %bb.0: ; %main_body
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: s_mov_b32 m0, s20
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, s20
|
|
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 lds
|
|
; CHECK-NEXT: ; asyncmark
|
|
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:4 glc lds
|
|
; CHECK-NEXT: ; asyncmark
|
|
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:8 slc lds
|
|
; CHECK-NEXT: ; wait_asyncmark(1)
|
|
; CHECK-NEXT: s_waitcnt vmcnt(2)
|
|
; CHECK-NEXT: ds_read_b32 v0, v0
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
main_body:
|
|
call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
|
|
call void @llvm.amdgcn.asyncmark()
|
|
call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
|
|
call void @llvm.amdgcn.asyncmark()
|
|
call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
|
|
call void @llvm.amdgcn.wait.asyncmark(i16 1)
|
|
%res = load float, ptr addrspace(3) %lds
|
|
ret float %res
|
|
}
|
|
|
|
define float @struct.buffer.load(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
|
|
; CHECK-LABEL: struct.buffer.load:
|
|
; CHECK: ; %bb.0: ; %main_body
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: s_mov_b32 m0, s20
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, 8
|
|
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen lds
|
|
; CHECK-NEXT: ; asyncmark
|
|
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:4 glc lds
|
|
; CHECK-NEXT: ; asyncmark
|
|
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:8 slc lds
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, s20
|
|
; CHECK-NEXT: ; wait_asyncmark(1)
|
|
; CHECK-NEXT: s_waitcnt vmcnt(2)
|
|
; CHECK-NEXT: ds_read_b32 v0, v0
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
main_body:
|
|
call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
|
|
call void @llvm.amdgcn.asyncmark()
|
|
call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
|
|
call void @llvm.amdgcn.asyncmark()
|
|
call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
|
|
call void @llvm.amdgcn.wait.asyncmark(i16 1)
|
|
%res = load float, ptr addrspace(3) %lds
|
|
ret float %res
|
|
}
|
|
|
|
define float @struct.ptr.buffer.load(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
|
|
; CHECK-LABEL: struct.ptr.buffer.load:
|
|
; CHECK: ; %bb.0: ; %main_body
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: s_mov_b32 m0, s20
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, 8
|
|
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen lds
|
|
; CHECK-NEXT: ; asyncmark
|
|
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:4 glc lds
|
|
; CHECK-NEXT: ; asyncmark
|
|
; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:8 slc lds
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, s20
|
|
; CHECK-NEXT: ; wait_asyncmark(1)
|
|
; CHECK-NEXT: s_waitcnt vmcnt(2)
|
|
; CHECK-NEXT: ds_read_b32 v0, v0
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
main_body:
|
|
call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
|
|
call void @llvm.amdgcn.asyncmark()
|
|
call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
|
|
call void @llvm.amdgcn.asyncmark()
|
|
call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
|
|
call void @llvm.amdgcn.wait.asyncmark(i16 1)
|
|
%res = load float, ptr addrspace(3) %lds
|
|
ret float %res
|
|
}
|