llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
Sameer Sahasrabuddhe b02b395a1e
[AMDGPU] Asynchronous loads from global/buffer to LDS on pre-GFX12 (#180466)
The existing "LDS DMA" builtins/intrinsics copy data from a global/buffer
pointer to LDS. These are now augmented with ".async" versions,
where the compiler does not automatically track completion. The
completion is now tracked using explicit mark/wait intrinsics, which
must be inserted by the user. This makes it possible to write programs
with efficient waits in software pipeline loops. The program can now
wait for only the oldest outstanding operations to finish, while
launching more operations for later use.

This change only contains the new names of the builtins/intrinsics,
which continue to behave exactly like their non-async counterparts. A
later change will implement the actual mark/wait semantics in
SIInsertWaitcnts.

This is part of a stack split out from #173259:
- #180467
- #180466

Fixes: SWDEV-521121
2026-02-11 05:26:58 +00:00

427 lines
18 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefix=GFX90A
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefix=GFX90A
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefix=GFX942
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s --check-prefix=GFX10
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefix=GFX942-GISEL
;; Note: load.to.lds is a wrapper intrinsic around underlying operations.
;; This is a bare-bones test to ensure that it lowers to the correct instructions.
;; Declarations of the intrinsic under test, one per source address space:
;; .p1 takes an addrspace(1) source (used by the global_* tests) and .p7 an
;; addrspace(7) source (used by the buffer_* tests). Operands, in order, are
;; the source pointer, the LDS (addrspace(3)) pointer, size, offset and aux.
;; NOTE(review): the .async p7 variant called by @buffer_load_async_lds has no
;; explicit declaration in this file; presumably it relies on the IR parser
;; declaring intrinsics implicitly -- confirm.
declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
;; Global source, 4-byte copy. %gptr is a plain (non-inreg) argument and shows
;; up as the v[0:1] address in the expected output; %lptr is inreg and is the
;; value copied from s0 into m0. The call passes size = 4, offset = 16 and
;; aux = 1; the expected load carries offset:16, and the aux value appears as
;; glc (sc0 on the gfx942 runs).
define amdgpu_ps void @global_load_lds_dword_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: global_load_lds_dword_vaddr_saddr:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_mov_b32 m0, s0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_load_dword v[0:1], off offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: global_load_lds_dword_vaddr_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_mov_b32 m0, s0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:16 sc0
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: global_load_lds_dword_vaddr_saddr:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 m0, s0
; GFX10-NEXT: global_load_dword v[0:1], off offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: global_load_lds_dword_vaddr_saddr:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: s_mov_b32 m0, s0
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:16 sc0
; GFX942-GISEL-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 16, i32 1)
ret void
}
;; Exercises the .async flavour of the intrinsic on a buffer (addrspace(7))
;; source. The IR mirrors @buffer_load_lds_dword_vaddr_saddr (same GEP, same
;; size = 4, offset = 16, aux = 1 operands), and the expected ISA is
;; instruction-for-instruction the same as that test's on every run line.
define amdgpu_ps void @buffer_load_async_lds(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: buffer_load_async_lds:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: buffer_load_async_lds:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_async_lds:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: s_mov_b32 m0, s5
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_load_async_lds:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-GISEL-NEXT: s_endpgm
main_body:
%gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
call void @llvm.amdgcn.load.async.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 16, i32 1)
ret void
}
;; Buffer (addrspace(7)) source, 4-byte copy. The inreg %gptr is advanced by
;; the non-inreg %off through a byte GEP before the call, which passes
;; size = 4, offset = 16 and aux = 1. The expected output adds s4 to v0, loads
;; m0 from s5 and issues an offen buffer_load_dword ... lds with offset:16 and
;; glc (sc0 on the gfx942 runs).
define amdgpu_ps void @buffer_load_lds_dword_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_dword_vaddr_saddr:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: buffer_load_lds_dword_vaddr_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_lds_dword_vaddr_saddr:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: s_mov_b32 m0, s5
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_load_lds_dword_vaddr_saddr:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-GISEL-NEXT: s_endpgm
main_body:
%gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 16, i32 1)
ret void
}
;; 2-byte variant of the global dword test: identical signature and
;; offset = 16 / aux = 1 operands, but size = 2. The expected output differs
;; only in the opcode, which becomes the ushort form of the load.
define amdgpu_ps void @global_load_lds_ushort_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: global_load_lds_ushort_vaddr_saddr:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_mov_b32 m0, s0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_load_ushort v[0:1], off offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: global_load_lds_ushort_vaddr_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_mov_b32 m0, s0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_ushort v[0:1], off offset:16 sc0
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: global_load_lds_ushort_vaddr_saddr:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 m0, s0
; GFX10-NEXT: global_load_ushort v[0:1], off offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: global_load_lds_ushort_vaddr_saddr:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: s_mov_b32 m0, s0
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: global_load_lds_ushort v[0:1], off offset:16 sc0
; GFX942-GISEL-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 2, i32 16, i32 1)
ret void
}
;; 2-byte variant of the buffer dword test: same GEP by %off and the same
;; offset = 16 / aux = 1 operands, but size = 2. The expected output keeps the
;; add / m0 setup and switches the load to buffer_load_ushort ... lds.
define amdgpu_ps void @buffer_load_lds_ushort_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_ushort_vaddr_saddr:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_ushort v0, s[0:3], 0 offen offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: buffer_load_lds_ushort_vaddr_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: buffer_load_ushort v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_lds_ushort_vaddr_saddr:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: s_mov_b32 m0, s5
; GFX10-NEXT: buffer_load_ushort v0, s[0:3], 0 offen offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_load_lds_ushort_vaddr_saddr:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: buffer_load_ushort v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-GISEL-NEXT: s_endpgm
main_body:
%gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 2, i32 16, i32 1)
ret void
}
;; 1-byte variant of the global dword test: identical signature and
;; offset = 16 / aux = 1 operands, but size = 1. The expected output differs
;; only in the opcode, which becomes the ubyte form of the load.
define amdgpu_ps void @global_load_lds_ubyte_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: global_load_lds_ubyte_vaddr_saddr:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_mov_b32 m0, s0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_load_ubyte v[0:1], off offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: global_load_lds_ubyte_vaddr_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_mov_b32 m0, s0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_ubyte v[0:1], off offset:16 sc0
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: global_load_lds_ubyte_vaddr_saddr:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 m0, s0
; GFX10-NEXT: global_load_ubyte v[0:1], off offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: global_load_lds_ubyte_vaddr_saddr:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: s_mov_b32 m0, s0
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: global_load_lds_ubyte v[0:1], off offset:16 sc0
; GFX942-GISEL-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 1, i32 16, i32 1)
ret void
}
;; 1-byte variant of the buffer dword test: same GEP by %off and the same
;; offset = 16 / aux = 1 operands, but size = 1. The expected output keeps the
;; add / m0 setup and switches the load to buffer_load_ubyte ... lds.
define amdgpu_ps void @buffer_load_lds_ubyte_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_ubyte_vaddr_saddr:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_ubyte v0, s[0:3], 0 offen offset:16 glc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: buffer_load_lds_ubyte_vaddr_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: buffer_load_ubyte v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_lds_ubyte_vaddr_saddr:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: s_mov_b32 m0, s5
; GFX10-NEXT: buffer_load_ubyte v0, s[0:3], 0 offen offset:16 glc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_load_lds_ubyte_vaddr_saddr:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: buffer_load_ubyte v0, s[0:3], 0 offen offset:16 sc0 lds
; GFX942-GISEL-NEXT: s_endpgm
main_body:
%gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 1, i32 16, i32 1)
ret void
}
;; Three back-to-back 4-byte global loads to the same LDS pointer, at offsets
;; 0, 256 and 512. Only the first call passes aux = 2147483648 (0x80000000,
;; i.e. just bit 31 set) -- presumably the "volatile" request this test is
;; named after; confirm against the intrinsic's aux encoding. The other two
;; pass aux = 0.
;; Expected output: the first load carries glc (glc dlc on gfx1010, sc0 sc1 on
;; the gfx942 runs) and is followed by s_waitcnt vmcnt(0); the two later loads
;; carry no cache-policy modifiers and have no wait after them.
define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: global_load_lds_dword_volatile:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_mov_b32 m0, s0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_load_dword v[0:1], off glc lds
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_load_dword v[0:1], off offset:256 lds
; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: global_load_lds_dword_volatile:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_mov_b32 m0, s0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:256
; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: global_load_lds_dword_volatile:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 m0, s0
; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v[0:1], off offset:256 lds
; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: global_load_lds_dword_volatile:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: s_mov_b32 m0, s0
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:256
; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
; GFX942-GISEL-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0)
call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
ret void
}
;; Buffer counterpart of @global_load_lds_dword_volatile: three 4-byte loads
;; from the same %gptr + %off address at offsets 0, 256 and 512, with
;; aux = 2147483648 (0x80000000, bit 31) on the first call only --
;; presumably the "volatile" request; confirm against the aux encoding.
;; Expected output: the first load carries glc (glc dlc on gfx1010, sc0 sc1 on
;; the gfx942 runs) and is followed by s_waitcnt vmcnt(0); the later two have
;; no cache-policy modifiers. Note that the gfx900/gfx90a and SelectionDAG
;; gfx942 expectations write m0 before the v_add and contain no s_nop, while
;; the GlobalISel gfx942 expectation keeps the add / m0 write / s_nop order
;; seen in the other buffer tests.
define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_dword_volatile:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: buffer_load_lds_dword_volatile:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_lds_dword_volatile:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: s_mov_b32 m0, s5
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_load_lds_dword_volatile:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX942-GISEL-NEXT: s_endpgm
main_body:
%gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0)
call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
ret void
}
;; 4-byte global load with offset = 0 and aux = 0, where the call carries
;; !nontemporal metadata instead of an aux cache-policy value. Unlike the other
;; global tests, %gptr is inreg here: the expected output uses the s[0:1] base
;; with a zeroed v0 and loads m0 from s2.
;; The metadata is expected to surface as glc slc on the gfx900/gfx90a runs,
;; nt on the gfx942 runs and slc on gfx1010.
define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: global_load_lds_dword_nontemporal:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_mov_b32 m0, s2
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_load_dword v0, s[0:1] glc slc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: global_load_lds_dword_nontemporal:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_mov_b32 m0, s2
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_dword v0, s[0:1] nt
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: global_load_lds_dword_nontemporal:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b32 m0, s2
; GFX10-NEXT: global_load_dword v0, s[0:1] slc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: global_load_lds_dword v0, s[0:1] nt
; GFX942-GISEL-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
ret void
}
;; Buffer counterpart of the nontemporal test: 4-byte load from %gptr + %off
;; with offset = 0 and aux = 0, where the call carries !nontemporal metadata.
;; The expected buffer_load_dword ... lds shows glc slc on the gfx900/gfx90a
;; runs, nt on the gfx942 runs and slc on gfx1010.
define amdgpu_ps void @buffer_load_lds_dword_nontemporal(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_dword_nontemporal:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: buffer_load_lds_dword_nontemporal:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_lds_dword_nontemporal:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: s_mov_b32 m0, s5
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds
; GFX10-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_load_lds_dword_nontemporal:
; GFX942-GISEL: ; %bb.0: ; %main_body
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
; GFX942-GISEL-NEXT: s_endpgm
main_body:
%gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
ret void
}
;; Metadata node referenced by the !nontemporal attachments on the calls in
;; the two *_nontemporal tests.
!0 = !{i32 1}