llvm-project/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
Fabian Ritter 01b4b2a5b8
[AMDGPU][SDAG] Handle ISD::PTRADD in VOP3 patterns (#143881)
This patch mirrors similar patterns for ISD::ADD. The main difference is
that ISD::ADD is commutative, so that a pattern definition for, e.g.,
(add (mul x, y), z), automatically also handles (add z, (mul x, y)).
ISD::PTRADD is not commutative, so we would need to handle these cases
explicitly. This patch only implements (ptradd z, (op x, y)) patterns,
where the nested operation (shift or multiply) is the offset of the
ptradd (i.e., the right operand), since base pointers that are the
result of a shift or multiply seem less likely.

For SWDEV-516125.
2025-09-18 15:01:07 +02:00

342 lines
14 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s
; Baseline case: loads a float from %p indexed by %idx, where the i32 index is
; sign-extended to i64 before the getelementptr. The checks expect a single
; global_load_b32 with v0 as the offset operand, s[0:1] as the base and the
; scale_offset modifier, with no separate address arithmetic.
define amdgpu_ps float @global_load_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: global_load_b32_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
%ret = load float, ptr addrspace(1) %arrayidx, align 4
ret float %ret
}
; Same float load as the idxprom variant, but the getelementptr takes the i32
; index directly (there is no explicit sext in the IR). The checks expect the
; same single global_load_b32 with the scale_offset modifier.
define amdgpu_ps float @global_load_b32_idx32(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: global_load_b32_idx32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i32 %idx
%ret = load float, ptr addrspace(1) %arrayidx, align 4
ret float %ret
}
; Negative case: the getelementptr steps over <2 x float> elements while the
; load reads only a single float, so the index stride does not match the size
; of the loaded value. The checks expect no scale_offset modifier here; the
; address is instead computed explicitly (sign-extension of the index followed
; by v_lshl_add_u64 with a shift of 3) and the load uses a full VGPR address
; with "off".
define amdgpu_ps float @global_load_b32_idxprom_wrong_stride(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: global_load_b32_idxprom_wrong_stride:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
; GCN-NEXT: global_load_b32 v0, v[0:1], off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
%ret = load float, ptr addrspace(1) %arrayidx, align 4
ret float %ret
}
; Loads an i16 from element index (%idx + 16) and returns it zero-extended and
; bitcast to float. The checks expect the constant part of the index to appear
; as an immediate byte offset (offset:32, i.e. 16 elements of 2 bytes) on a
; global_load_u16 that still carries the scale_offset modifier.
define amdgpu_ps float @global_load_b16_idxprom_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: global_load_b16_idxprom_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%idxadd = add i64 %idxprom, 16
%arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxadd
%ld = load i16, ptr addrspace(1) %arrayidx, align 2
%ret.i32 = zext i16 %ld to i32
%ret = bitcast i32 %ret.i32 to float
ret float %ret
}
; 64-bit variant: loads a <2 x float> from an array of <2 x float> indexed by
; the sign-extended %idx. The checks expect a single global_load_b64 with the
; scale_offset modifier.
define amdgpu_ps <2 x float> @global_load_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: global_load_b64_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
%ret = load <2 x float>, ptr addrspace(1) %arrayidx, align 4
ret <2 x float> %ret
}
; 96-bit variant: the getelementptr steps over [3 x float] elements and the
; load reads a <3 x float> from the resulting address. The checks expect a
; single global_load_b96 with the scale_offset modifier.
define amdgpu_ps <3 x float> @global_load_b96_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: global_load_b96_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom
%ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
ret <3 x float> %ret
}
; 96-bit load from element index (%idx + 16) of a [3 x float] array. The checks
; expect the constant part of the index as an immediate byte offset
; (offset:192, i.e. 16 elements of 12 bytes) on a global_load_b96 that still
; carries the scale_offset modifier.
; NOTE(review): "idxpromi" in the function name looks like a typo for
; "idxprom"; it is kept because the label check is tied to the symbol name.
define amdgpu_ps <3 x float> @global_load_b96_idxpromi_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: global_load_b96_idxpromi_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%idxadd = add i64 %idxprom, 16
%arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxadd
%ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
ret <3 x float> %ret
}
; 128-bit variant: loads a <4 x float> from an array of <4 x float> indexed by
; the sign-extended %idx. The checks expect a single global_load_b128 with the
; scale_offset modifier.
define amdgpu_ps <4 x float> @global_load_b128_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: global_load_b128_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom
%ret = load <4 x float>, ptr addrspace(1) %arrayidx, align 4
ret <4 x float> %ret
}
; Float load where the index is not an argument but is itself loaded from %pp
; and annotated with !range !0. The checks expect the index load (followed by a
; wait on the load counter) and then a global_load_b32 that uses the loaded
; index with the scale_offset modifier.
define amdgpu_ps float @global_load_b32_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
; GCN-LABEL: global_load_b32_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b32 v0, v[0:1], off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
%ret = load float, ptr addrspace(1) %arrayidx, align 4
ret float %ret
}
; Float load from element index (loaded index + 16), where the index is loaded
; from %pp with !range !0. The checks expect the constant part as an immediate
; byte offset (offset:64, i.e. 16 elements of 4 bytes) on a global_load_b32
; that still carries the scale_offset modifier.
define amdgpu_ps float @global_load_b32_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
; GCN-LABEL: global_load_b32_idxprom_range_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b32 v0, v[0:1], off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: global_load_b32 v0, v0, s[0:1] offset:64 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%idxadd = add i64 %idxprom, 16
%arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxadd
%ret = load float, ptr addrspace(1) %arrayidx, align 4
ret float %ret
}
; Loads an i8 from byte index (loaded index + 16), where the index is loaded
; from %pp with !range !0, and returns it zero-extended and bitcast to float.
; Note: this is a byte load, so there is nothing to scale. The checks expect a
; global_load_u8 with offset:16 and without the scale_offset modifier.
define amdgpu_ps float @global_load_b8_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
; GCN-LABEL: global_load_b8_idxprom_range_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b32 v0, v[0:1], off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: global_load_u8 v0, v0, s[0:1] offset:16
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%idxadd = add i64 %idxprom, 16
%arrayidx = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %idxadd
%ld = load i8, ptr addrspace(1) %arrayidx
%ret.i32 = zext i8 %ld to i32
%ret = bitcast i32 %ret.i32 to float
ret float %ret
}
; Loads an i16 from an i16 array, with the index loaded from %pp under
; !range !0; the result is zero-extended and bitcast to float. The checks
; expect the index load followed by a global_load_u16 with the scale_offset
; modifier.
define amdgpu_ps float @global_load_b16_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
; GCN-LABEL: global_load_b16_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b32 v0, v[0:1], off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: global_load_u16 v0, v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom
%ld = load i16, ptr addrspace(1) %arrayidx, align 2
%ret.i32 = zext i16 %ld to i32
%ret = bitcast i32 %ret.i32 to float
ret float %ret
}
; Loads an i16 from element index (loaded index + 16), where the index is
; loaded from %pp with !range !0. The checks expect the constant part as an
; immediate byte offset (offset:32, i.e. 16 elements of 2 bytes) on a
; global_load_u16 that still carries the scale_offset modifier.
define amdgpu_ps float @global_load_b16_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
; GCN-LABEL: global_load_b16_idxprom_range_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b32 v0, v[0:1], off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%idxadd = add i64 %idxprom, 16
%arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxadd
%ld = load i16, ptr addrspace(1) %arrayidx, align 2
%ret.i32 = zext i16 %ld to i32
%ret = bitcast i32 %ret.i32 to float
ret float %ret
}
; 64-bit variant of the range test: loads a <2 x float> from an array of
; <2 x float>, with the index loaded from %pp under !range !0. The checks
; expect the index load followed by a global_load_b64 with the scale_offset
; modifier.
define amdgpu_ps <2 x float> @global_load_b64_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
; GCN-LABEL: global_load_b64_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b32 v0, v[0:1], off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
%ret = load <2 x float>, ptr addrspace(1) %arrayidx, align 4
ret <2 x float> %ret
}
; 96-bit variant of the range test: the getelementptr steps over [3 x float]
; elements and the load reads a <3 x float>, with the index loaded from %pp
; under !range !0. The checks expect the index load followed by a
; global_load_b96 with the scale_offset modifier.
define amdgpu_ps <3 x float> @global_load_b96_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
; GCN-LABEL: global_load_b96_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b32 v0, v[0:1], off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom
%ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
ret <3 x float> %ret
}
; 96-bit load from element index (loaded index + 16) of a [3 x float] array,
; where the index is loaded from %pp with !range !0. The checks expect the
; constant part as an immediate byte offset (offset:192, i.e. 16 elements of
; 12 bytes) on a global_load_b96 that still carries the scale_offset modifier.
define amdgpu_ps <3 x float> @global_load_b96_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
; GCN-LABEL: global_load_b96_idxprom_range_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b32 v0, v[0:1], off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%idxadd = add i64 %idxprom, 16
%arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxadd
%ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
ret <3 x float> %ret
}
; 128-bit variant of the range test: loads a <4 x float> from an array of
; <4 x float>, with the index loaded from %pp under !range !0. The checks
; expect the index load followed by a global_load_b128 with the scale_offset
; modifier.
define amdgpu_ps <4 x float> @global_load_b128_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
; GCN-LABEL: global_load_b128_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: global_load_b32 v0, v[0:1], off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom
%ret = load <4 x float>, ptr addrspace(1) %arrayidx, align 4
ret <4 x float> %ret
}
; Store counterpart of the b32 load test: stores the float constant 1.0 to the
; float array element selected by the sign-extended %idx. The checks expect the
; constant to be materialized in v1 and a single global_store_b32 with the
; scale_offset modifier.
define amdgpu_ps void @global_store_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: global_store_b32_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
store float 1.0, ptr addrspace(1) %arrayidx, align 4
ret void
}
; 16-bit store: writes the i16 constant 1 to the i16 array element selected by
; the sign-extended %idx (the base pointer is only 2-byte aligned here). The
; checks expect a single global_store_b16 with the scale_offset modifier.
define amdgpu_ps void @global_store_b16_idxprom(ptr addrspace(1) align 2 inreg %p, i32 %idx) {
; GCN-LABEL: global_store_b16_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom
store i16 1, ptr addrspace(1) %arrayidx, align 2
ret void
}
; 64-bit store: writes the double constant 1.0 to the double array element
; selected by the sign-extended %idx (the store is only 4-byte aligned). The
; checks expect the constant in a 64-bit register pair and a single
; global_store_b64 with the scale_offset modifier.
define amdgpu_ps void @global_store_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: global_store_b64_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0
; GCN-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds double, ptr addrspace(1) %p, i64 %idxprom
store double 1.0, ptr addrspace(1) %arrayidx, align 4
ret void
}
; Atomic without a used result: monotonic atomicrmw add of 1 on the i32 array
; element selected by the sign-extended %idx. The checks expect a single
; non-returning global_atomic_add_u32 with the scale_offset modifier and
; scope:SCOPE_SYS.
define amdgpu_ps void @global_atomicrmw_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: global_atomicrmw_b32_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %idxprom
atomicrmw add ptr addrspace(1) %arrayidx, i32 1 monotonic
ret void
}
; Returning 64-bit atomic: monotonic atomicrmw add of 1 on the i64 array
; element selected by the sign-extended %idx, with the old value bitcast to
; <2 x float> and returned. The checks expect a single global_atomic_add_u64
; with the scale_offset modifier, th:TH_ATOMIC_RETURN and scope:SCOPE_SYS,
; followed by a wait on the load counter.
define amdgpu_ps <2 x float> @global_atomicrmw_b64_rtn_idxprom(ptr addrspace(1) align 8 inreg %p, i32 %idx) {
; GCN-LABEL: global_atomicrmw_b64_rtn_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b64_e32 v[2:3], 1
; GCN-NEXT: global_atomic_add_u64 v[0:1], v0, v[2:3], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds i64, ptr addrspace(1) %p, i64 %idxprom
%ret = atomicrmw add ptr addrspace(1) %arrayidx, i64 1 monotonic
%ret.cast = bitcast i64 %ret to <2 x float>
ret <2 x float> %ret.cast
}
; Range metadata attached to the index loads in the *_range tests: the loaded
; i32 index is in the half-open interval [0, 1024).
!0 = !{i32 0, i32 1024}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GISEL: {{.*}}
; SDAG: {{.*}}