This patch mirrors similar patterns for ISD::ADD. The main difference is that ISD::ADD is commutative, so that a pattern definition for, e.g., (add (mul x, y), z), automatically also handles (add z, (mul x, y)). ISD::PTRADD is not commutative, so we would need to handle these cases explicitly. This patch only implements (ptradd z, (op x, y)) patterns, where the nested operation (shift or multiply) is the offset of the ptradd (i.e., the right operand), since base pointers that are the result of a shift or multiply seem less likely. For SWDEV-516125.
342 lines
14 KiB
LLVM
342 lines
14 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s
|
|
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s
|
|
|
|
; Baseline case: an i32 index is sign-extended to i64 and used to index a
; float array through an SGPR (inreg) base pointer, then a float is loaded.
; The assertions expect a single global_load_b32 that takes the raw index in
; v0 with the scale_offset modifier, with no separate shift/add of the index.
define amdgpu_ps float @global_load_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_load_b32_idxprom:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
|
|
%ret = load float, ptr addrspace(1) %arrayidx, align 4
|
|
ret float %ret
|
|
}
|
|
|
|
; Same float load as the idxprom variant, but the getelementptr is indexed
; directly with the i32 value (no explicit sext in the IR). The assertions
; expect the identical scale_offset form of global_load_b32.
define amdgpu_ps float @global_load_b32_idx32(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_load_b32_idx32:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i32 %idx
|
|
%ret = load float, ptr addrspace(1) %arrayidx, align 4
|
|
ret float %ret
|
|
}
|
|
|
|
; Negative case: the getelementptr strides over <2 x float> elements (8 bytes)
; while the load only reads a float (4 bytes), so the stride does not match the
; access size. The assertions expect no scale_offset here: the index is
; sign-extended (v_ashrrev_i32), the address is formed explicitly with
; v_lshl_add_u64 (shift by 3), and the load uses a full 64-bit VGPR address.
define amdgpu_ps float @global_load_b32_idxprom_wrong_stride(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_load_b32_idxprom_wrong_stride:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
|
|
; GCN-NEXT: global_load_b32 v0, v[0:1], off
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
|
|
%ret = load float, ptr addrspace(1) %arrayidx, align 4
|
|
ret float %ret
|
|
}
|
|
|
|
; 16-bit load with a constant added to the index: the sign-extended index plus
; 16 selects an i16 element, which is loaded and zero-extended. The assertions
; expect the constant to be folded into the immediate as offset:32
; (16 elements x 2 bytes) while the variable index still uses scale_offset.
define amdgpu_ps float @global_load_b16_idxprom_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_load_b16_idxprom_ioffset:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idxprom = sext i32 %idx to i64
|
|
%idxadd = add i64 %idxprom, 16
|
|
%arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxadd
|
|
%ld = load i16, ptr addrspace(1) %arrayidx, align 2
|
|
%ret.i32 = zext i16 %ld to i32
|
|
%ret = bitcast i32 %ret.i32 to float
|
|
ret float %ret
|
|
}
|
|
|
|
; 64-bit variant: the sign-extended index strides over <2 x float> elements
; (8 bytes) and a whole <2 x float> is loaded, so stride and access size agree.
; The assertions expect global_load_b64 with the scale_offset modifier.
define amdgpu_ps <2 x float> @global_load_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_load_b64_idxprom:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
|
|
%ret = load <2 x float>, ptr addrspace(1) %arrayidx, align 4
|
|
ret <2 x float> %ret
|
|
}
|
|
|
|
; 96-bit variant: the index strides over [3 x float] elements (12 bytes, not a
; power of two) and a <3 x float> is loaded from the selected element.
; The assertions expect global_load_b96 with the scale_offset modifier.
define amdgpu_ps <3 x float> @global_load_b96_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_load_b96_idxprom:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom
|
|
%ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
|
|
ret <3 x float> %ret
|
|
}
|
|
|
|
; 96-bit load with a constant added to the index: index + 16 selects a
; [3 x float] element (12 bytes). The assertions expect the constant folded
; into offset:192 (16 elements x 12 bytes) alongside scale_offset.
; NOTE(review): the name reads "idxpromi" where sibling tests use "idxprom";
; presumably a typo. Renaming requires regenerating the assertions.
define amdgpu_ps <3 x float> @global_load_b96_idxpromi_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_load_b96_idxpromi_ioffset:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idxprom = sext i32 %idx to i64
|
|
%idxadd = add i64 %idxprom, 16
|
|
%arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxadd
|
|
%ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
|
|
ret <3 x float> %ret
|
|
}
|
|
|
|
; 128-bit variant: the index strides over <4 x float> elements (16 bytes) and
; a whole <4 x float> is loaded. The assertions expect global_load_b128 with
; the scale_offset modifier.
define amdgpu_ps <4 x float> @global_load_b128_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_load_b128_idxprom:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom
|
|
%ret = load <4 x float>, ptr addrspace(1) %arrayidx, align 4
|
|
ret <4 x float> %ret
|
|
}
|
|
|
|
; Range-annotated variant of the float load: the index is not an argument but
; is itself loaded from %pp with !range !0 attached, then sign-extended and
; used to index a float array off the SGPR base %p.
; The assertions expect the index load (plain VGPR address, "off") followed by
; a global_load_b32 that consumes the loaded index with scale_offset.
define amdgpu_ps float @global_load_b32_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
|
|
; GCN-LABEL: global_load_b32_idxprom_range:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b32 v0, v[0:1], off
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
|
|
%ret = load float, ptr addrspace(1) %arrayidx, align 4
|
|
ret float %ret
|
|
}
|
|
|
|
; Range-annotated index plus a constant: the index is loaded from %pp with
; !range !0, sign-extended, and 16 is added before indexing a float array.
; The assertions expect the constant folded into offset:64
; (16 elements x 4 bytes) on a global_load_b32 that still uses scale_offset.
define amdgpu_ps float @global_load_b32_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
|
|
; GCN-LABEL: global_load_b32_idxprom_range_ioffset:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b32 v0, v[0:1], off
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: global_load_b32 v0, v0, s[0:1] offset:64 scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
|
|
%idxprom = sext i32 %idx to i64
|
|
%idxadd = add i64 %idxprom, 16
|
|
%arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxadd
|
|
%ret = load float, ptr addrspace(1) %arrayidx, align 4
|
|
ret float %ret
|
|
}
|
|
|
|
; Note: this is a byte load, there is nothing to scale
|
|
|
|
; Byte-sized case: the range-annotated index plus 16 selects an i8 element,
; which is loaded and zero-extended. With 1-byte elements the index already
; equals the byte offset, so the assertions expect global_load_u8 with
; offset:16 and without the scale_offset modifier.
define amdgpu_ps float @global_load_b8_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
|
|
; GCN-LABEL: global_load_b8_idxprom_range_ioffset:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b32 v0, v[0:1], off
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: global_load_u8 v0, v0, s[0:1] offset:16
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
|
|
%idxprom = sext i32 %idx to i64
|
|
%idxadd = add i64 %idxprom, 16
|
|
%arrayidx = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %idxadd
|
|
%ld = load i8, ptr addrspace(1) %arrayidx
|
|
%ret.i32 = zext i8 %ld to i32
|
|
%ret = bitcast i32 %ret.i32 to float
|
|
ret float %ret
|
|
}
|
|
|
|
; 16-bit load with a range-annotated index: the index is loaded from %pp with
; !range !0, sign-extended, and used to index an i16 array; the loaded i16 is
; zero-extended and returned as float bits.
; The assertions expect global_load_u16 with the scale_offset modifier.
define amdgpu_ps float @global_load_b16_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
|
|
; GCN-LABEL: global_load_b16_idxprom_range:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b32 v0, v[0:1], off
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: global_load_u16 v0, v0, s[0:1] scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom
|
|
%ld = load i16, ptr addrspace(1) %arrayidx, align 2
|
|
%ret.i32 = zext i16 %ld to i32
|
|
%ret = bitcast i32 %ret.i32 to float
|
|
ret float %ret
|
|
}
|
|
|
|
; 16-bit load with a range-annotated index plus a constant: index + 16 selects
; an i16 element. The assertions expect the constant folded into offset:32
; (16 elements x 2 bytes) on a global_load_u16 that still uses scale_offset.
define amdgpu_ps float @global_load_b16_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
|
|
; GCN-LABEL: global_load_b16_idxprom_range_ioffset:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b32 v0, v[0:1], off
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
|
|
%idxprom = sext i32 %idx to i64
|
|
%idxadd = add i64 %idxprom, 16
|
|
%arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxadd
|
|
%ld = load i16, ptr addrspace(1) %arrayidx, align 2
|
|
%ret.i32 = zext i16 %ld to i32
|
|
%ret = bitcast i32 %ret.i32 to float
|
|
ret float %ret
|
|
}
|
|
|
|
; 64-bit load with a range-annotated index: the index is loaded from %pp with
; !range !0, sign-extended, and used to index <2 x float> elements (8 bytes).
; The assertions expect global_load_b64 with the scale_offset modifier.
define amdgpu_ps <2 x float> @global_load_b64_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
|
|
; GCN-LABEL: global_load_b64_idxprom_range:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b32 v0, v[0:1], off
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
|
|
%ret = load <2 x float>, ptr addrspace(1) %arrayidx, align 4
|
|
ret <2 x float> %ret
|
|
}
|
|
|
|
; 96-bit load with a range-annotated index: the index is loaded from %pp with
; !range !0, sign-extended, and used to index [3 x float] elements (12 bytes).
; The assertions expect global_load_b96 with the scale_offset modifier.
define amdgpu_ps <3 x float> @global_load_b96_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
|
|
; GCN-LABEL: global_load_b96_idxprom_range:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b32 v0, v[0:1], off
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom
|
|
%ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
|
|
ret <3 x float> %ret
|
|
}
|
|
|
|
; 96-bit load with a range-annotated index plus a constant: index + 16 selects
; a [3 x float] element (12 bytes). The assertions expect the constant folded
; into offset:192 (16 elements x 12 bytes) on a global_load_b96 that still
; uses scale_offset.
define amdgpu_ps <3 x float> @global_load_b96_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
|
|
; GCN-LABEL: global_load_b96_idxprom_range_ioffset:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b32 v0, v[0:1], off
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
|
|
%idxprom = sext i32 %idx to i64
|
|
%idxadd = add i64 %idxprom, 16
|
|
%arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxadd
|
|
%ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
|
|
ret <3 x float> %ret
|
|
}
|
|
|
|
; 128-bit load with a range-annotated index: the index is loaded from %pp with
; !range !0, sign-extended, and used to index <4 x float> elements (16 bytes).
; The assertions expect global_load_b128 with the scale_offset modifier.
define amdgpu_ps <4 x float> @global_load_b128_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
|
|
; GCN-LABEL: global_load_b128_idxprom_range:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: global_load_b32 v0, v[0:1], off
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom
|
|
%ret = load <4 x float>, ptr addrspace(1) %arrayidx, align 4
|
|
ret <4 x float> %ret
|
|
}
|
|
|
|
; Store counterpart of the float load test: the sign-extended index selects a
; float element off the SGPR base and the constant 1.0 is stored to it.
; The assertions expect the constant materialized in v1 and a
; global_store_b32 that takes the raw index in v0 with scale_offset.
define amdgpu_ps void @global_store_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_store_b32_idxprom:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
|
|
; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
|
|
; GCN-NEXT: s_endpgm
|
|
entry:
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
|
|
store float 1.0, ptr addrspace(1) %arrayidx, align 4
|
|
ret void
|
|
}
|
|
|
|
; 16-bit store: the sign-extended index selects an i16 element off a 2-byte
; aligned SGPR base and the constant 1 is stored to it.
; The assertions expect global_store_b16 with the scale_offset modifier.
define amdgpu_ps void @global_store_b16_idxprom(ptr addrspace(1) align 2 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_store_b16_idxprom:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 1
|
|
; GCN-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset
|
|
; GCN-NEXT: s_endpgm
|
|
entry:
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom
|
|
store i16 1, ptr addrspace(1) %arrayidx, align 2
|
|
ret void
|
|
}
|
|
|
|
; 64-bit store: the sign-extended index selects a double element (8 bytes) and
; the constant 1.0 is stored to it; note the store is only 4-byte aligned.
; The assertions expect the constant materialized in v[2:3] and a
; global_store_b64 with the scale_offset modifier.
define amdgpu_ps void @global_store_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_store_b64_idxprom:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0
|
|
; GCN-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
|
|
; GCN-NEXT: s_endpgm
|
|
entry:
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds double, ptr addrspace(1) %p, i64 %idxprom
|
|
store double 1.0, ptr addrspace(1) %arrayidx, align 4
|
|
ret void
|
|
}
|
|
|
|
; 32-bit atomic without a used result: a monotonic atomicrmw add of 1 on the
; i32 element selected by the sign-extended index.
; The assertions expect global_atomic_add_u32 with scale_offset and
; scope:SCOPE_SYS, with no return value requested.
define amdgpu_ps void @global_atomicrmw_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_atomicrmw_b32_idxprom:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 1
|
|
; GCN-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
|
|
; GCN-NEXT: s_endpgm
|
|
entry:
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %idxprom
|
|
atomicrmw add ptr addrspace(1) %arrayidx, i32 1 monotonic
|
|
ret void
|
|
}
|
|
|
|
; 64-bit atomic with a used result: a monotonic atomicrmw add of 1 on the i64
; element selected by the sign-extended index; the old value is bitcast to
; <2 x float> and returned.
; The assertions expect the returning form of global_atomic_add_u64
; (th:TH_ATOMIC_RETURN, scope:SCOPE_SYS) with the scale_offset modifier.
define amdgpu_ps <2 x float> @global_atomicrmw_b64_rtn_idxprom(ptr addrspace(1) align 8 inreg %p, i32 %idx) {
|
|
; GCN-LABEL: global_atomicrmw_b64_rtn_idxprom:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: v_mov_b64_e32 v[2:3], 1
|
|
; GCN-NEXT: global_atomic_add_u64 v[0:1], v0, v[2:3], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS
|
|
; GCN-NEXT: s_wait_loadcnt 0x0
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
entry:
|
|
%idxprom = sext i32 %idx to i64
|
|
%arrayidx = getelementptr inbounds i64, ptr addrspace(1) %p, i64 %idxprom
|
|
%ret = atomicrmw add ptr addrspace(1) %arrayidx, i64 1 monotonic
|
|
%ret.cast = bitcast i64 %ret to <2 x float>
|
|
ret <2 x float> %ret.cast
|
|
}
|
|
|
|
; Range metadata attached via !range to the index loads in the *_range tests.
; Per the LLVM LangRef the pair is a half-open interval, so the loaded i32
; index is in [0, 1024).
!0 = !{i32 0, i32 1024}
|
|
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
|
; GISEL: {{.*}}
|
|
; SDAG: {{.*}}
|