llvm-project/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
Fabian Ritter 01b4b2a5b8
[AMDGPU][SDAG] Handle ISD::PTRADD in VOP3 patterns (#143881)
This patch mirrors similar patterns for ISD::ADD. The main difference is
that ISD::ADD is commutative, so that a pattern definition for, e.g.,
(add (mul x, y), z), automatically also handles (add z, (mul x, y)).
ISD::PTRADD is not commutative, so we would need to handle these cases
explicitly. This patch only implements (ptradd z, (op x, y)) patterns,
where the nested operation (shift or multiply) is the offset of the
ptradd (i.e., the right operand), since base pointers that are the
result of a shift or multiply seem less likely.

For SWDEV-516125.
2025-09-18 15:01:07 +02:00

420 lines
16 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s
; Baseline case: a float load from %p indexed by an i32 that is sign-extended
; to i64 before the getelementptr. The element size (4 bytes) equals the load
; size, and both selectors are expected to emit a single flat_load_b32 that
; takes the index in v0, the base in s[0:1] and the scale_offset modifier.
define amdgpu_ps float @flat_load_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_load_b32_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom
%ret = load float, ptr %arrayidx, align 4
ret float %ret
}
; Same float load as the idxprom variant, but the getelementptr consumes the
; i32 index directly, with no explicit sext in the IR. The expected output is
; identical: one flat_load_b32 with scale_offset.
define amdgpu_ps float @flat_load_b32_idx32(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_load_b32_idx32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%arrayidx = getelementptr inbounds float, ptr %p, i32 %idx
%ret = load float, ptr %arrayidx, align 4
ret float %ret
}
; Negative case: the getelementptr strides over <2 x float> (8 bytes) while the
; load reads a single float (4 bytes). Because stride and access size differ,
; scale_offset is not expected; the checks instead require an explicit sign
; extension (v_ashrrev_i32), a 64-bit shift-by-3-and-add of the base
; (v_lshl_add_u64) and a flat_load_b32 from the full address in v[0:1].
define amdgpu_ps float @flat_load_b32_idxprom_wrong_stride(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_load_b32_idxprom_wrong_stride:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
; GCN-NEXT: flat_load_b32 v0, v[0:1]
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom
%ret = load float, ptr %arrayidx, align 4
ret float %ret
}
; 16-bit load with a constant added to the index: the sign-extended index plus
; 16 selects an i16 element, and the loaded value is zero-extended and returned
; as float bits. The checks expect the constant to be folded into the
; instruction as offset:32 (16 elements * 2 bytes) alongside scale_offset.
define amdgpu_ps float @flat_load_b16_idxprom_ioffset(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_load_b16_idxprom_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%idxadd = add i64 %idxprom, 16
%arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxadd
%ld = load i16, ptr %arrayidx, align 2
%ret.i32 = zext i16 %ld to i32
%ret = bitcast i32 %ret.i32 to float
ret float %ret
}
; 64-bit case: both the getelementptr element type and the loaded type are
; <2 x float> (8 bytes). The checks expect one flat_load_b64 into v[0:1] with
; scale_offset.
define amdgpu_ps <2 x float> @flat_load_b64_idxprom(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_load_b64_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom
%ret = load <2 x float>, ptr %arrayidx, align 4
ret <2 x float> %ret
}
; 96-bit case: the getelementptr strides over [3 x float] (12 bytes) and the
; load reads a <3 x float>. The checks expect one flat_load_b96 into v[0:2]
; with scale_offset, i.e. the non-power-of-two element size is still handled.
define amdgpu_ps <3 x float> @flat_load_b96_idxprom(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_load_b96_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxprom
%ret = load <3 x float>, ptr %arrayidx, align 4
ret <3 x float> %ret
}
; 96-bit load with a constant added to the index: index + 16 over [3 x float]
; elements. The checks expect the constant folded as offset:192
; (16 elements * 12 bytes) together with scale_offset.
define amdgpu_ps <3 x float> @flat_load_b96_idxpromi_ioffset(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_load_b96_idxpromi_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%idxadd = add i64 %idxprom, 16
%arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxadd
%ret = load <3 x float>, ptr %arrayidx, align 4
ret <3 x float> %ret
}
; 128-bit case: getelementptr and load both use <4 x float> (16 bytes). The
; checks expect one flat_load_b128 into v[0:3] with scale_offset.
define amdgpu_ps <4 x float> @flat_load_b128_idxprom(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_load_b128_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds <4 x float>, ptr %p, i64 %idxprom
%ret = load <4 x float>, ptr %arrayidx, align 4
ret <4 x float> %ret
}
; Variant where the index is not an argument but is loaded from %pp and tagged
; with !range !0. The checks expect two loads: a plain flat_load_b32 from
; v[0:1] that fetches the index, a wait, and then the float load from %p using
; that index with scale_offset.
define amdgpu_ps float @flat_load_b32_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
; GCN-LABEL: flat_load_b32_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b32 v0, v[0:1]
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom
%ret = load float, ptr %arrayidx, align 4
ret float %ret
}
; Range-annotated index (loaded from %pp with !range !0) plus a constant 16,
; used to address float elements. After the index load, the checks expect a
; flat_load_b32 with offset:64 (16 elements * 4 bytes) and scale_offset.
define amdgpu_ps float @flat_load_b32_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
; GCN-LABEL: flat_load_b32_idxprom_range_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b32 v0, v[0:1]
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] offset:64 scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%idxadd = add i64 %idxprom, 16
%arrayidx = getelementptr inbounds float, ptr %p, i64 %idxadd
%ret = load float, ptr %arrayidx, align 4
ret float %ret
}
; Note: this is a byte load, there is nothing to scale
; The range-annotated index plus a constant 16 addresses i8 elements, so the
; element size is 1 byte. The checks expect a flat_load_u8 that still uses the
; index register with the s[0:1] base and folds the constant as offset:16, but
; carries no scale_offset modifier.
define amdgpu_ps float @flat_load_b8_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
; GCN-LABEL: flat_load_b8_idxprom_range_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b32 v0, v[0:1]
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: flat_load_u8 v0, v0, s[0:1] offset:16
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%idxadd = add i64 %idxprom, 16
%arrayidx = getelementptr inbounds i8, ptr %p, i64 %idxadd
%ld = load i8, ptr %arrayidx
%ret.i32 = zext i8 %ld to i32
%ret = bitcast i32 %ret.i32 to float
ret float %ret
}
; 16-bit load addressed by a range-annotated index loaded from %pp. The i16
; value is zero-extended and returned as float bits. After the index load, the
; checks expect a flat_load_u16 with scale_offset.
define amdgpu_ps float @flat_load_b16_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
; GCN-LABEL: flat_load_b16_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b32 v0, v[0:1]
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom
%ld = load i16, ptr %arrayidx, align 2
%ret.i32 = zext i16 %ld to i32
%ret = bitcast i32 %ret.i32 to float
ret float %ret
}
; 16-bit load addressed by a range-annotated index plus a constant 16. After
; the index load, the checks expect a flat_load_u16 with offset:32
; (16 elements * 2 bytes) and scale_offset.
define amdgpu_ps float @flat_load_b16_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
; GCN-LABEL: flat_load_b16_idxprom_range_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b32 v0, v[0:1]
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%idxadd = add i64 %idxprom, 16
%arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxadd
%ld = load i16, ptr %arrayidx, align 2
%ret.i32 = zext i16 %ld to i32
%ret = bitcast i32 %ret.i32 to float
ret float %ret
}
; 64-bit load of <2 x float> addressed by a range-annotated index loaded from
; %pp. After the index load, the checks expect a flat_load_b64 into v[0:1]
; with scale_offset.
define amdgpu_ps <2 x float> @flat_load_b64_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
; GCN-LABEL: flat_load_b64_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b32 v0, v[0:1]
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom
%ret = load <2 x float>, ptr %arrayidx, align 4
ret <2 x float> %ret
}
; 96-bit load addressed by a range-annotated index: the getelementptr strides
; over [3 x float] and the load reads <3 x float>. After the index load, the
; checks expect a flat_load_b96 into v[0:2] with scale_offset.
define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
; GCN-LABEL: flat_load_b96_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b32 v0, v[0:1]
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxprom
%ret = load <3 x float>, ptr %arrayidx, align 4
ret <3 x float> %ret
}
; 96-bit load addressed by a range-annotated index plus a constant 16 over
; [3 x float] elements. After the index load, the checks expect a
; flat_load_b96 with offset:192 (16 elements * 12 bytes) and scale_offset.
define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
; GCN-LABEL: flat_load_b96_idxprom_range_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b32 v0, v[0:1]
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%idxadd = add i64 %idxprom, 16
%arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxadd
%ret = load <3 x float>, ptr %arrayidx, align 4
ret <3 x float> %ret
}
; 128-bit load of <4 x float> addressed by a range-annotated index loaded from
; %pp. After the index load, the checks expect a flat_load_b128 into v[0:3]
; with scale_offset.
define amdgpu_ps <4 x float> @flat_load_b128_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
; GCN-LABEL: flat_load_b128_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_load_b32 v0, v[0:1]
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
%idx = load i32, ptr %pp, align 4, !range !0
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds <4 x float>, ptr %p, i64 %idxprom
%ret = load <4 x float>, ptr %arrayidx, align 4
ret <4 x float> %ret
}
; Store counterpart of the b32 load test: writes the constant 1.0 to the float
; element of %p selected by the sign-extended index. The checks expect the
; constant materialized in v1 and a flat_store_b32 with scale_offset, followed
; by s_endpgm since the shader returns void.
define amdgpu_ps void @flat_store_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_store_b32_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom
store float 1.0, ptr %arrayidx, align 4
ret void
}
; 16-bit store: writes the i16 constant 1 to the element of %p (declared
; align 2) selected by the sign-extended index. The checks expect the constant
; in v1 and a flat_store_b16 with scale_offset.
define amdgpu_ps void @flat_store_b16_idxprom(ptr align 2 inreg %p, i32 %idx) {
; GCN-LABEL: flat_store_b16_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom
store i16 1, ptr %arrayidx, align 2
ret void
}
; 64-bit store: writes the double constant 1.0 (store alignment 4) to the
; element of %p selected by the sign-extended index. The checks expect the
; constant materialized with a 64-bit move into v[2:3] and a flat_store_b64
; with scale_offset.
define amdgpu_ps void @flat_store_b64_idxprom(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_store_b64_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0
; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds double, ptr %p, i64 %idxprom
store double 1.0, ptr %arrayidx, align 4
ret void
}
; 32-bit atomic without a used result: a monotonic atomicrmw add of 1 on the
; i32 element of %p selected by the sign-extended index. The checks expect a
; non-returning flat_atomic_add_u32 with scale_offset and scope:SCOPE_SYS.
define amdgpu_ps void @flat_atomicrmw_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_atomicrmw_b32_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: flat_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds i32, ptr %p, i64 %idxprom
atomicrmw add ptr %arrayidx, i32 1 monotonic
ret void
}
; 64-bit returning atomic: a monotonic atomicrmw add of 1 on the i64 element of
; %p (declared align 8) selected by the sign-extended index; the old value is
; returned as <2 x float>.
;
; This is the only test in the file whose expectations differ per selector.
; In both outputs the atomic is split into an %atomicrmw.global path using
; flat_atomic_add_u64 and an %atomicrmw.private path using a scratch load, add
; and scratch store, chosen by comparing the high address dword against
; src_flat_scratch_base_hi. Both outputs therefore compute the full 64-bit
; address with v_lshl_add_u64 (shift by 3).
;
; The selectors differ in how the flat atomic itself is addressed:
;  * SelectionDAG uses the precomputed address in v[2:3], without scale_offset.
;  * GlobalISel uses the original index in v2 with the s[0:1] base and
;    scale_offset, keeping the computed address in v[4:5] for the private path.
define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %p, i32 %idx) {
; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1]
; SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; SDAG-NEXT: v_xor_b32_e32 v0, s0, v3
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; SDAG-NEXT: s_cbranch_execnz .LBB21_3
; SDAG-NEXT: ; %bb.1: ; %Flow
; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; SDAG-NEXT: s_cbranch_execnz .LBB21_4
; SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi
; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; SDAG-NEXT: s_branch .LBB21_5
; SDAG-NEXT: .LBB21_3: ; %atomicrmw.global
; SDAG-NEXT: v_mov_b64_e32 v[0:1], 1
; SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; SDAG-NEXT: s_wait_xcnt 0x0
; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; SDAG-NEXT: s_cbranch_execz .LBB21_2
; SDAG-NEXT: .LBB21_4: ; %atomicrmw.private
; SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; SDAG-NEXT: s_wait_loadcnt 0x0
; SDAG-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; SDAG-NEXT: s_wait_xcnt 0x0
; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; SDAG-NEXT: s_branch .LBB21_5
; SDAG-NEXT: .LBB21_5:
;
; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v0, src_flat_scratch_base_hi
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GISEL-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 3, s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_xor_b32_e32 v0, v5, v0
; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_xor_b32 s2, exec_lo, s2
; GISEL-NEXT: s_cbranch_execnz .LBB21_3
; GISEL-NEXT: ; %bb.1: ; %Flow
; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2
; GISEL-NEXT: s_cbranch_execnz .LBB21_4
; GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi
; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GISEL-NEXT: s_branch .LBB21_5
; GISEL-NEXT: .LBB21_3: ; %atomicrmw.global
; GISEL-NEXT: v_mov_b64_e32 v[0:1], 1
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: flat_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GISEL-NEXT: s_wait_xcnt 0x0
; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2
; GISEL-NEXT: s_cbranch_execz .LBB21_2
; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_sub_nc_u32_e32 v0, v4, v0
; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GISEL-NEXT: s_wait_loadcnt 0x0
; GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off
; GISEL-NEXT: s_wait_xcnt 0x0
; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GISEL-NEXT: s_branch .LBB21_5
; GISEL-NEXT: .LBB21_5:
entry:
%idxprom = sext i32 %idx to i64
%arrayidx = getelementptr inbounds i64, ptr %p, i64 %idxprom
%ret = atomicrmw add ptr %arrayidx, i64 1 monotonic
%ret.cast = bitcast i64 %ret to <2 x float>
ret <2 x float> %ret.cast
}
; Range metadata referenced by the index loads in the *_range tests above.
; Under LLVM's !range semantics the pair denotes the half-open interval
; [0, 1024), so the loaded i32 index is known to be non-negative.
!0 = !{i32 0, i32 1024}