[AMDGPU] Update codegen tests for PR #154069 (#154862)

This commit is contained in:
Anshil Gandhi 2025-08-22 07:58:21 -06:00 committed by GitHub
parent 37664cd991
commit 7dfd5ba811
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 723 additions and 193 deletions

View File

@ -1,12 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel %s -o - 2>&1 | FileCheck %s
; This file checks that the translation from LLVM IR to generic
; MachineInstr is correct.
; Tests for add.
; CHECK: name: addi32
; CHECK: {{%[0-9]+}}:_(s32) = G_ADD
define amdgpu_kernel void @addi32(i32 %arg1, i32 %arg2) {
define void @addi32(i32 %arg1, i32 %arg2) {
; CHECK-LABEL: name: addi32
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
; CHECK-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
; CHECK-NEXT: SI_RETURN
%res = add i32 %arg1, %arg2
store i32 %res, ptr addrspace(1) poison
ret void

View File

@ -1,12 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
declare i32 @llvm.amdgcn.workitem.id.x() readnone
; SI-LABEL: {{^}}test_i64_vreg:
; SI: v_add_i32
; SI: v_addc_u32
define amdgpu_kernel void @test_i64_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
; SI-LABEL: test_i64_vreg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[12:15], 0 addr64
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid
%b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid
@ -18,10 +35,22 @@ define amdgpu_kernel void @test_i64_vreg(ptr addrspace(1) noalias %out, ptr addr
}
; Check that the SGPR add operand is correctly moved to a VGPR.
; SI-LABEL: {{^}}sgpr_operand:
; SI: s_add_u32
; SI: s_addc_u32
define amdgpu_kernel void @sgpr_operand(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in_bar, i64 %a) {
; SI-LABEL: sgpr_operand:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s4, s6, s4
; SI-NEXT: s_addc_u32 s5, s7, s5
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
%foo = load i64, ptr addrspace(1) %in, align 8
%result = add i64 %foo, %a
store i64 %result, ptr addrspace(1) %out
@ -30,35 +59,76 @@ define amdgpu_kernel void @sgpr_operand(ptr addrspace(1) noalias %out, ptr addrs
; Swap the arguments. Check that the SGPR -> VGPR copy works with the
; SGPR as the other operand.
;
; SI-LABEL: {{^}}sgpr_operand_reversed:
; SI: s_add_u32
; SI: s_addc_u32
; Kernel under test: loads an i64 from %in and adds it to the kernel argument
; %a, with %a as the first operand of the add, then stores the sum to %out.
; The autogenerated assertions below expect the 64-bit add to be emitted as a
; scalar s_add_u32 / s_addc_u32 pair (argument registers first, loaded value
; second), with the result copied into v0/v1 only for the buffer store.
define amdgpu_kernel void @sgpr_operand_reversed(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %a) {
; SI-LABEL: sgpr_operand_reversed:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s4, s4, s6
; SI-NEXT: s_addc_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
%foo = load i64, ptr addrspace(1) %in, align 8
%result = add i64 %a, %foo
store i64 %result, ptr addrspace(1) %out
ret void
}
; SI-LABEL: {{^}}test_v2i64_sreg:
; SI: s_add_u32
; SI: s_addc_u32
; SI: s_add_u32
; SI: s_addc_u32
; Kernel under test: adds the two <2 x i64> kernel arguments %a and %b and
; stores the vector result to %out.
; The autogenerated assertions below expect one s_add_u32 / s_addc_u32 pair per
; 64-bit element (two pairs in total), followed by copies into v0..v3 and a
; single buffer_store_dwordx4.
define amdgpu_kernel void @test_v2i64_sreg(ptr addrspace(1) noalias %out, <2 x i64> %a, <2 x i64> %b) {
; SI-LABEL: test_v2i64_sreg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s4, s10, s14
; SI-NEXT: s_addc_u32 s5, s11, s15
; SI-NEXT: s_add_u32 s6, s8, s12
; SI-NEXT: s_addc_u32 s7, s9, s13
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v2, s4
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = add <2 x i64> %a, %b
store <2 x i64> %result, ptr addrspace(1) %out
ret void
}
; SI-LABEL: {{^}}test_v2i64_vreg:
; SI: v_add_i32
; SI: v_addc_u32
; SI: v_add_i32
; SI: v_addc_u32
define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
; SI-LABEL: test_v2i64_vreg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; SI-NEXT: v_mov_b32_e32 v5, 0
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64
; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid
%b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid
@ -69,14 +139,19 @@ define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr ad
ret void
}
; SI-LABEL: {{^}}trunc_i64_add_to_i32:
; SI: s_load_dword s[[SREG0:[0-9]+]]
; SI: s_load_dword s[[SREG1:[0-9]+]]
; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
; SI-NOT: addc
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
define amdgpu_kernel void @trunc_i64_add_to_i32(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b) {
; SI-LABEL: trunc_i64_add_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[4:5], 0xd
; SI-NEXT: s_load_dword s6, s[4:5], 0x11
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s4, s6, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%add = add i64 %b, %a
%trunc = trunc i64 %add to i32
store i32 %trunc, ptr addrspace(1) %out, align 8

View File

@ -1,24 +1,94 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI,SICI,SICIVI,PREGFX11,GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=SICI,CIVI,SICIVI,PREGFX11,GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=CIVI,SICIVI,GFX8PLUS,PREGFX11,GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9PLUS,GFX8PLUS,PREGFX11,GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11,GFX9PLUS,GFX8PLUS,GCN %s
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11 %s
; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
; GFX9PLUS-NOT: m0
; SICIVI-DAG: s_mov_b32 m0
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; GFX8PLUS-DAG: s_load_{{dword|b32}} [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; PREGFX11: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
; GFX11: ds_cmpstore_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VSWAP]], [[VCMP]] offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(ptr addrspace(1) %out, [8 x i32], ptr addrspace(3) %ptr, [8 x i32], i32 %swap) nounwind {
; CHECK-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dword s2, s[4:5], 0x13
; CHECK-NEXT: s_load_dword s3, s[4:5], 0x1c
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CHECK-NEXT: v_mov_b32_e32 v0, 7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: v_mov_b32_e32 v2, s3
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s3, 0xf000
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CHECK-NEXT: s_endpgm
;
; GFX7-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x13
; GFX7-NEXT: s_load_dword s3, s[4:5], 0x1c
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: v_mov_b32_e32 v0, 7
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x4c
; GFX8-NEXT: s_load_dword s3, s[4:5], 0x70
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 7
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v1, v0, v2 offset:16
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x4c
; GFX9-NEXT: s_load_dword s3, s[4:5], 0x70
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x4c
; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x70
; GFX11-NEXT: v_mov_b32_e32 v0, 7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v2, s1
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_cmpstore_rtn_b32 v0, v1, v2, v0 offset:16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
%pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic
%result = extractvalue { i32, i1 } %pair, 0
@ -26,24 +96,100 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(ptr addrspace(1) %o
ret void
}
; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
; GFX9PLUS-NOT: m0
; SICIVI-DAG: s_mov_b32 m0
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GFX8PLUS-DAG: s_load_{{dwordx2|b64}} s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
; PREGFX11: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32
; GFX11: ds_cmpstore_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32
; GCN: [[RESULT]]
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i64 %swap) nounwind {
; CHECK-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb
; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CHECK-NEXT: v_mov_b32_e32 v0, 7
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v4, s6
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s3, 0xf000
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CHECK-NEXT: s_endpgm
;
; GFX7-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: v_mov_b32_e32 v0, 7
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 7
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 7
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 7
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, s2
; GFX11-NEXT: v_mov_b32_e32 v3, s1
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_cmpstore_rtn_b64 v[0:1], v4, v[2:3], v[0:1] offset:32
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
%pair = cmpxchg ptr addrspace(3) %gep, i64 7, i64 %swap seq_cst monotonic
%result = extractvalue { i64, i1 } %pair, 0
@ -51,13 +197,103 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %o
ret void
}
; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
; GFX9PLUS-NOT: m0
; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GFX9PLUS: ds_{{cmpst|cmpstore}}_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
; CHECK-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_load_dword s3, s[4:5], 0xb
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; CHECK-NEXT: s_sub_i32 s1, s1, s2
; CHECK-NEXT: s_lshl_b32 s1, s1, 2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_add_i32 s1, s3, s1
; CHECK-NEXT: s_add_i32 s1, s1, 16
; CHECK-NEXT: v_mov_b32_e32 v0, 7
; CHECK-NEXT: v_mov_b32_e32 v1, s0
; CHECK-NEXT: v_mov_b32_e32 v2, s1
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v2, v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s7, 0xf000
; CHECK-NEXT: s_mov_b32 s6, -1
; CHECK-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CHECK-NEXT: s_endpgm
;
; GFX7-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX7-NEXT: v_mov_b32_e32 v0, 7
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_sub_i32 s2, s2, s3
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_lshl_b32 s1, s2, 2
; GFX7-NEXT: s_add_i32 s0, s0, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v2, v0, v1 offset:16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 7
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sub_i32 s2, s2, s3
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_lshl_b32 s1, s2, 2
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v2, v0, v1 offset:16
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_i32 s2, s2, s3
; GFX9-NEXT: s_lshl_b32 s2, s2, 2
; GFX9-NEXT: s_add_i32 s0, s0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: v_mov_b32_e32 v0, 7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: v_mov_b32_e32 v2, s1
; GFX11-NEXT: s_lshl_b32 s2, s2, 2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s0, s0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_cmpstore_rtn_b32 v0, v1, v2, v0 offset:16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 %add
@ -67,45 +303,152 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac
ret void
}
; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
; GFX9PLUS-NOT: m0
; SICIVI-DAG: s_mov_b32 m0
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x12
; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
; GFX8PLUS-DAG: s_load_{{dword|b32}} [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48
; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; PREGFX11: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
; GFX11: ds_cmpstore_b32 [[VPTR]], [[VSWAP]], [[VCMP]] offset:16
; GCN: s_endpgm
; Kernel under test: performs a seq_cst/monotonic i32 cmpxchg on the
; addrspace(3) pointer %ptr advanced by 4 elements, comparing against the
; constant 7 and swapping in %swap. Nothing is written back to memory by the
; kernel itself, so the assertions below cover the non-returning DS compare-swap.
; Visible differences between the per-prefix assertion groups:
;   - the default-prefix, GFX7 and GFX8 groups expect m0 to be set to -1 before
;     the DS instruction; the GFX9 and GFX11 groups contain no m0 write;
;   - GFX11 expects ds_cmpstore_b32 with the swap and compare registers in the
;     opposite order from ds_cmpst_b32, plus a trailing buffer_gl0_inv;
;   - every group expects the element offset folded into a 16-byte immediate.
define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(ptr addrspace(3) %ptr, [8 x i32], i32 %swap) nounwind {
; CHECK-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x9
; CHECK-NEXT: s_load_dword s1, s[4:5], 0x12
; CHECK-NEXT: v_mov_b32_e32 v0, 7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s0
; CHECK-NEXT: v_mov_b32_e32 v2, s1
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_endpgm
;
; GFX7-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x9
; GFX7-NEXT: s_load_dword s1, s[4:5], 0x12
; GFX7-NEXT: v_mov_b32_e32 v0, 7
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x48
; GFX8-NEXT: v_mov_b32_e32 v0, 7
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dword s1, s[4:5], 0x48
; GFX9-NEXT: v_mov_b32_e32 v0, 7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x48
; GFX11-NEXT: v_mov_b32_e32 v0, 7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v2, s1
; GFX11-NEXT: ds_cmpstore_b32 v1, v2, v0 offset:16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
%pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic
; %result is extracted but never used or stored before the return.
%result = extractvalue { i32, i1 } %pair, 0
ret void
}
; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
; GFX9PLUS-NOT: m0
; SICIVI-DAG: s_mov_b32 m0
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
; GFX8PLUS-DAG: s_load_{{dwordx2|b64}} s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
; PREGFX11: ds_cmpst_b64 [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32
; GFX11: ds_cmpstore_b64 [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(ptr addrspace(3) %ptr, i64 %swap) nounwind {
; CHECK-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dword s2, s[4:5], 0x9
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; CHECK-NEXT: v_mov_b32_e32 v0, 7
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v4, s2
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_endpgm
;
; GFX7-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GFX7-NEXT: v_mov_b32_e32 v0, 7
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX8-NEXT: v_mov_b32_e32 v0, 7
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v0, 7
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-NEXT: v_mov_b32_e32 v0, 7
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, s2
; GFX11-NEXT: v_mov_b32_e32 v3, s1
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: ds_cmpstore_b64 v4, v[2:3], v[0:1] offset:32
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
%pair = cmpxchg ptr addrspace(3) %gep, i64 7, i64 %swap seq_cst monotonic
%result = extractvalue { i64, i1 } %pair, 0

View File

@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
@ -7,123 +8,210 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readnone
declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone
; GCN-LABEL: {{^}}test_div_fmas_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
; GCN-DAG: s_bitcmp1_b32 s{{[0-9]+}}, 0
; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]]
; GCN: buffer_store_dword [[RESULT]],
; Baseline f32 case: all three float operands and the i1 condition of
; llvm.amdgcn.div.fmas.f32 come from kernel arguments, and the result is
; stored to %out. The unnamed [8 x i32] arguments space the scalar arguments
; apart, which is why the expected loads use offsets 0x13, 0x1c, 0x25 and 0x2e.
define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
; GCN-LABEL: test_div_fmas_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT: s_load_dword s6, s[4:5], 0x13
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e
; GCN-NEXT: s_load_dword s7, s[4:5], 0x1c
; GCN-NEXT: s_load_dword s4, s[4:5], 0x25
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
; The assertions above expect the value loaded from the last kernarg offset to
; be bit-tested (s_bitcmp1_b32) and turned into a vcc mask (s_cselect_b64)
; before v_div_fmas_f32 is issued.
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0:
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
; SI: buffer_store_dword [[RESULT]],
; Same shape as test_div_fmas_f32, but the first operand of the intrinsic is
; the constant 1.0 instead of %a (the %a argument is declared but unused).
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
; GCN-LABEL: test_div_fmas_f32_inline_imm_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e
; GCN-NEXT: s_load_dword s6, s[4:5], 0x1c
; GCN-NEXT: s_load_dword s4, s[4:5], 0x25
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_div_fmas_f32 v0, 1.0, v0, v1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
; The assertions above expect 1.0 to appear directly as the first source
; operand of v_div_fmas_f32; only %b and %c are copied into VGPRs.
%result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
; GCN: buffer_store_dword [[RESULT]],
; Variant with the constant 1.0 as the second operand of the intrinsic (the %b
; argument is declared but unused). Unlike the sibling tests, the three float
; arguments are adjacent here; only %d is preceded by [8 x i32] padding.
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind {
; GCN-LABEL: test_div_fmas_f32_inline_imm_1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT: s_load_dword s2, s[4:5], 0x16
; GCN-NEXT: s_load_dword s6, s[4:5], 0xb
; GCN-NEXT: s_load_dword s4, s[4:5], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_div_fmas_f32 v0, v0, 1.0, v1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
; The assertions above expect 1.0 to appear directly as the second source
; operand of v_div_fmas_f32; only %a and %c are copied into VGPRs.
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
; GCN: buffer_store_dword [[RESULT]],
; Variant with the constant 1.0 as the third operand of the intrinsic (the %c
; argument is declared but unused).
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
; GCN-LABEL: test_div_fmas_f32_inline_imm_2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e
; GCN-NEXT: s_load_dword s6, s[4:5], 0x13
; GCN-NEXT: s_load_dword s4, s[4:5], 0x1c
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, 1.0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
; The assertions above expect 1.0 to appear directly as the third source
; operand of v_div_fmas_f32; only %a and %b are copied into VGPRs.
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f64:
; GCN: v_div_fmas_f64
; Double-precision counterpart: calls llvm.amdgcn.div.fmas.f64 with three
; double kernel arguments and an i1 condition, then stores the 8-byte result
; to %out.
define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind {
; GCN-LABEL: test_div_fmas_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s8, s[4:5], 0x11
; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s8, 0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_mov_b32 s10, -1
; GCN-NEXT: s_mov_b32 s8, s0
; GCN-NEXT: s_mov_b32 s9, s1
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s7
; GCN-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NEXT: s_endpgm
; The assertions above expect each double to occupy a pair of VGPRs and the
; result to be written with a single buffer_store_dwordx2.
%result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
store double %result, ptr addrspace(1) %out, align 8
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; The i1 condition is not a kernel argument here; it is computed in the
; function as (%i == 0) and passed to llvm.amdgcn.div.fmas.f32.
define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %i) nounwind {
; GCN-LABEL: test_div_fmas_f32_cond_to_vcc:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s3, 0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
; The assertions above expect the compare to be done on the scalar unit
; (s_cmp_eq_u32) and its outcome placed in vcc via s_cselect_b64.
%cmp = icmp eq i32 %i, 0
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
; GCN: s_mov_b64 vcc, 0
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; The condition operand of llvm.amdgcn.div.fmas.f32 is the constant false.
define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind {
; GCN-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: s_mov_b64 vcc, 0
; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
; The assertions above expect the constant condition to be written to vcc as
; an all-zero mask (s_mov_b64 vcc, 0) right before v_div_fmas_f32.
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
; GCN: s_mov_b64 vcc, -1
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; The condition operand of llvm.amdgcn.div.fmas.f32 is the constant true.
define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind {
; GCN-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: s_mov_b64 vcc, -1
; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
; The assertions above expect the constant condition to be written to vcc as
; an all-ones mask (s_mov_b64 vcc, -1) right before v_div_fmas_f32.
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: v_cmp_eq_u32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}}
; SI-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0{{$}}
; SI-DAG: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
; SI: s_endpgm
define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %d) nounwind {
; GCN-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_load_dword s8, s[4:5], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 offset:4 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 offset:8 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN-NEXT: s_and_b64 vcc, vcc, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, s7
; GCN-NEXT: v_div_fmas_f32 v0, v0, v3, v1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1
@ -143,26 +231,39 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
; SI: ; %entry
; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
; SI: s_mov_b64 vcc, 0
; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]]
; SI: ; %bb
; SI: buffer_load_dword [[LOAD:v[0-9]+]],
; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
; SI: s_and_b64 vcc, vcc, exec
; SI: ; %exit
; SI: s_or_b64 exec, exec, [[SAVE]]
; SI-NOT: vcc
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; SI: buffer_store_dword
; SI: s_endpgm
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) nounwind {
; GCN-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[0:1], s[10:11]
; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64
; GCN-NEXT: buffer_load_dword v3, v[3:4], s[0:3], 0 addr64 offset:8
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GCN-NEXT: s_mov_b64 vcc, 0
; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB9_2
; GCN-NEXT: ; %bb.1: ; %bb
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s7, s3
; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GCN-NEXT: s_and_b64 vcc, vcc, exec
; GCN-NEXT: .LBB9_2: ; %exit
; GCN-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GCN-NEXT: s_mov_b32 s10, -1
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8
; GCN-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.out = getelementptr float, ptr addrspace(1) %out, i32 2
@ -188,3 +289,5 @@ exit:
store float %result, ptr addrspace(1) %gep.out, align 4
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SI: {{.*}}