The existing "LDS DMA" builtins/intrinsics copy data from a global/buffer pointer to LDS. These are now augmented with ".async" variants, for which the compiler does not automatically track completion. Instead, completion is tracked using explicit mark/wait intrinsics, which must be inserted by the user. This makes it possible to write programs with efficient waits in software-pipelined loops: the program can wait for only the oldest outstanding operations to finish while launching more operations for later use. This change only introduces the new names of the builtins/intrinsics, which for now continue to behave exactly like their non-async counterparts. A later change will implement the actual mark/wait semantics in SIInsertWaitcnts. This is part of a stack split out from #173259: #180467, #180466. Fixes: SWDEV-521121
427 lines
18 KiB
LLVM
427 lines
18 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefix=GFX90A
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefix=GFX90A
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefix=GFX942
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s --check-prefix=GFX10
|
|
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefix=GFX942-GISEL
|
|
|
|
;; Note: load.to.lds is a wrapper intrinsic around the underlying global and
;; buffer load-to-LDS operations. This is a bare-bones test to ensure that it
;; (and its load.async.to.lds variant) lowers to the correct instructions.
|
|
|
|
;; Intrinsics exercised by this file. The suffix names the address space of the
;; source pointer (.p1 = addrspace 1, .p7 = addrspace 7); the destination is
;; always an addrspace 3 pointer, followed by i32 size, offset and aux operands.
;; The async variant is declared with the same signature it is called with in
;; buffer_load_async_lds.
declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
declare void @llvm.amdgcn.load.async.to.lds.p7(ptr addrspace(7) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
|
|
|
|
;; 4-byte load.to.lds from a global (addrspace 1) pointer with offset 16 and
;; aux 1. The source pointer is not inreg and the checks address it as v[0:1];
;; m0 is set from s0 before the load. Expected result is a dword LDS load with
;; offset:16 carrying glc (GFX90A/GFX10 prefixes) or sc0 (GFX942 prefixes).
define amdgpu_ps void @global_load_lds_dword_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: global_load_lds_dword_vaddr_saddr:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_mov_b32 m0, s0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_load_dword v[0:1], off offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: global_load_lds_dword_vaddr_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_mov_b32 m0, s0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:16 sc0
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: global_load_lds_dword_vaddr_saddr:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 m0, s0
; GFX10-NEXT: global_load_dword v[0:1], off offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: global_load_lds_dword_vaddr_saddr:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: s_mov_b32 m0, s0
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:16 sc0
; GFX942-GISEL-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 16, i32 1)
  ret void
}
|
|
|
|
;; 4-byte load.async.to.lds from a buffer (addrspace 7) pointer advanced by
;; %off, with offset 16 and aux 1. The expected instructions are the same as
;; for the non-async p7 dword test in this file: a v_add of s4 into v0, m0 set
;; from s5, then buffer_load_dword ... offen offset:16 ... lds with glc
;; (GFX90A/GFX10 prefixes) or sc0 (GFX942 prefixes).
define amdgpu_ps void @buffer_load_async_lds(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: buffer_load_async_lds:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: buffer_load_async_lds:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_async_lds:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: s_mov_b32 m0, s5
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_load_async_lds:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-GISEL-NEXT: s_endpgm
main_body:
  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
  call void @llvm.amdgcn.load.async.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 16, i32 1)
  ret void
}
|
|
|
|
;; 4-byte load.to.lds from a buffer (addrspace 7) pointer advanced by %off,
;; with offset 16 and aux 1. The checks expect the byte offset to be folded
;; into v0 (v_add with s4), m0 set from s5, and a buffer_load_dword ... offen
;; offset:16 ... lds carrying glc (GFX90A/GFX10 prefixes) or sc0 (GFX942
;; prefixes).
define amdgpu_ps void @buffer_load_lds_dword_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_dword_vaddr_saddr:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: buffer_load_lds_dword_vaddr_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_lds_dword_vaddr_saddr:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: s_mov_b32 m0, s5
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_load_lds_dword_vaddr_saddr:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-GISEL-NEXT: s_endpgm
main_body:
  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 16, i32 1)
  ret void
}
|
|
|
|
;; 2-byte variant of the global (addrspace 1) test: size 2, offset 16, aux 1.
;; The checks expect a ushort LDS load from v[0:1] with offset:16 carrying glc
;; (GFX90A/GFX10 prefixes) or sc0 (GFX942 prefixes), after m0 is set from s0.
define amdgpu_ps void @global_load_lds_ushort_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: global_load_lds_ushort_vaddr_saddr:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_mov_b32 m0, s0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_load_ushort v[0:1], off offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: global_load_lds_ushort_vaddr_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_mov_b32 m0, s0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_ushort v[0:1], off offset:16 sc0
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: global_load_lds_ushort_vaddr_saddr:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 m0, s0
; GFX10-NEXT: global_load_ushort v[0:1], off offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: global_load_lds_ushort_vaddr_saddr:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: s_mov_b32 m0, s0
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: global_load_lds_ushort v[0:1], off offset:16 sc0
; GFX942-GISEL-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 2, i32 16, i32 1)
  ret void
}
|
|
|
|
;; 2-byte variant of the buffer (addrspace 7) test: size 2, offset 16, aux 1,
;; source pointer advanced by %off. The checks expect v_add of s4 into v0, m0
;; set from s5, and buffer_load_ushort ... offen offset:16 ... lds carrying
;; glc (GFX90A/GFX10 prefixes) or sc0 (GFX942 prefixes).
define amdgpu_ps void @buffer_load_lds_ushort_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_ushort_vaddr_saddr:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_ushort v0, s[0:3], 0 offen offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: buffer_load_lds_ushort_vaddr_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: buffer_load_ushort v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_lds_ushort_vaddr_saddr:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: s_mov_b32 m0, s5
; GFX10-NEXT: buffer_load_ushort v0, s[0:3], 0 offen offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_load_lds_ushort_vaddr_saddr:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: buffer_load_ushort v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-GISEL-NEXT: s_endpgm
main_body:
  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 2, i32 16, i32 1)
  ret void
}
|
|
|
|
;; 1-byte variant of the global (addrspace 1) test: size 1, offset 16, aux 1.
;; The checks expect a ubyte LDS load from v[0:1] with offset:16 carrying glc
;; (GFX90A/GFX10 prefixes) or sc0 (GFX942 prefixes), after m0 is set from s0.
define amdgpu_ps void @global_load_lds_ubyte_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: global_load_lds_ubyte_vaddr_saddr:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_mov_b32 m0, s0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_load_ubyte v[0:1], off offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: global_load_lds_ubyte_vaddr_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_mov_b32 m0, s0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_ubyte v[0:1], off offset:16 sc0
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: global_load_lds_ubyte_vaddr_saddr:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 m0, s0
; GFX10-NEXT: global_load_ubyte v[0:1], off offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: global_load_lds_ubyte_vaddr_saddr:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: s_mov_b32 m0, s0
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: global_load_lds_ubyte v[0:1], off offset:16 sc0
; GFX942-GISEL-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 1, i32 16, i32 1)
  ret void
}
|
|
|
|
;; 1-byte variant of the buffer (addrspace 7) test: size 1, offset 16, aux 1,
;; source pointer advanced by %off. The checks expect v_add of s4 into v0, m0
;; set from s5, and buffer_load_ubyte ... offen offset:16 ... lds carrying
;; glc (GFX90A/GFX10 prefixes) or sc0 (GFX942 prefixes).
define amdgpu_ps void @buffer_load_lds_ubyte_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_ubyte_vaddr_saddr:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_ubyte v0, s[0:3], 0 offen offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: buffer_load_lds_ubyte_vaddr_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: buffer_load_ubyte v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_lds_ubyte_vaddr_saddr:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: s_mov_b32 m0, s5
; GFX10-NEXT: buffer_load_ubyte v0, s[0:3], 0 offen offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_load_lds_ubyte_vaddr_saddr:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: buffer_load_ubyte v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-GISEL-NEXT: s_endpgm
main_body:
  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 1, i32 16, i32 1)
  ret void
}
|
|
|
|
;; Three 4-byte global (addrspace 1) loads to LDS at offsets 0, 256 and 512.
;; Only the first call passes aux = 2147483648 (0x80000000, bit 31 set), which
;; per the test name marks the access volatile. The checks expect that load to
;; carry glc (GFX90A prefix), sc0 sc1 (GFX942 prefixes) or glc dlc (GFX10
;; prefix) and to be followed by s_waitcnt vmcnt(0); the two aux-0 loads are
;; emitted without cache bits and without a wait between them.
define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: global_load_lds_dword_volatile:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_mov_b32 m0, s0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_load_dword v[0:1], off glc lds
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_load_dword v[0:1], off offset:256 lds
; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: global_load_lds_dword_volatile:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_mov_b32 m0, s0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:256
; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: global_load_lds_dword_volatile:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 m0, s0
; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v[0:1], off offset:256 lds
; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: global_load_lds_dword_volatile:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: s_mov_b32 m0, s0
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:256
; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
; GFX942-GISEL-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0)
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
  ret void
}
|
|
|
|
;; Three 4-byte buffer (addrspace 7) loads to LDS at offsets 0, 256 and 512,
;; all through the same %gptr.off pointer. Only the first call passes
;; aux = 2147483648 (0x80000000, bit 31 set), which per the test name marks the
;; access volatile. The checks expect that load to carry glc (GFX90A prefix),
;; sc0 sc1 (GFX942 prefixes) or glc dlc (GFX10 prefix) and to be followed by
;; s_waitcnt vmcnt(0); the two aux-0 loads carry no cache bits. Note that the
;; GFX90A and GFX942 (SelectionDAG) checks place the m0 write before the v_add
;; and have no s_nop, unlike the GFX942 GlobalISel checks.
define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_dword_volatile:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: buffer_load_lds_dword_volatile:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_lds_dword_volatile:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: s_mov_b32 m0, s5
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_load_lds_dword_volatile:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX942-GISEL-NEXT: s_endpgm
main_body:
  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0)
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
  ret void
}
|
|
|
|
;; 4-byte global (addrspace 1) load to LDS with offset 0 and aux 0, tagged with
;; !nontemporal metadata. Here the source pointer is inreg, so the checks
;; address it as s[0:1] with a zeroed v0 and set m0 from s2. The nontemporal
;; hint is expected to appear as glc slc (GFX90A prefix), nt (GFX942 prefixes)
;; or slc (GFX10 prefix).
define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: global_load_lds_dword_nontemporal:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_mov_b32 m0, s2
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_load_dword v0, s[0:1] glc slc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: global_load_lds_dword_nontemporal:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_mov_b32 m0, s2
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_dword v0, s[0:1] nt
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: global_load_lds_dword_nontemporal:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b32 m0, s2
; GFX10-NEXT: global_load_dword v0, s[0:1] slc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: global_load_lds_dword v0, s[0:1] nt
; GFX942-GISEL-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
  ret void
}
|
|
|
|
;; 4-byte buffer (addrspace 7) load to LDS with offset 0 and aux 0, tagged with
;; !nontemporal metadata, through a pointer advanced by %off. The checks expect
;; v_add of s4 into v0, m0 set from s5, and a buffer_load_dword ... offen ...
;; lds on which the nontemporal hint appears as glc slc (GFX90A prefix), nt
;; (GFX942 prefixes) or slc (GFX10 prefix).
define amdgpu_ps void @buffer_load_lds_dword_nontemporal(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_dword_nontemporal:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: buffer_load_lds_dword_nontemporal:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_lds_dword_nontemporal:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: s_mov_b32 m0, s5
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_load_lds_dword_nontemporal:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
; GFX942-GISEL-NEXT: s_endpgm
main_body:
  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
  ret void
}
|
|
|
|
;; Metadata node referenced by the !nontemporal attachments on the two
;; *_nontemporal load.to.lds calls in this file.
!0 = !{i32 1}
|