[AMDGPU][True16][CodeGen] enable true16 for more codegen test patch 1 (#131206)

This is a NFC patch.

Enable true16 mode for more CodeGen tests
This commit is contained in:
Brox Chen 2025-03-14 14:57:42 -04:00 committed by GitHub
parent 7598ceaf51
commit 0b688f3ce1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 10492 additions and 4251 deletions

View File

@ -4,8 +4,10 @@
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define i16 @abs_i16(i16 %arg) {
; GFX6-LABEL: abs_i16:
@ -45,25 +47,45 @@ define i16 @abs_i16(i16 %arg) {
; GFX10-NEXT: v_max_i16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: abs_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sub_nc_u16 v1, 0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_i16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: abs_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: abs_i16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_sub_nc_u16 v1, 0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_i16 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-LABEL: abs_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_sub_nc_u16 v1, 0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_max_i16 v0, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-TRUE16-LABEL: abs_i16:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v0.h
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: abs_i16:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_sub_nc_u16 v1, 0, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_max_i16 v0, v0, v1
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
}

View File

@ -2,7 +2,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; FIXME: Need to handle non-uniform case for function below (load without gep).
; FIXME: VI or should be unnecessary
@ -753,26 +754,49 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX10-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_test_add_v2i16_zext_to_v2i64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, 0, 16, v2
; GFX11-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_test_add_v2i16_zext_to_v2i64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, 0, v0, 16
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
%gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid

View File

@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @br_cc_f16(
; SI-LABEL: br_cc_f16:
@ -60,32 +61,62 @@ define amdgpu_kernel void @br_cc_f16(
; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: br_cc_f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[8:9], s[4:5], 0x34
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
; GFX11-NEXT: s_mov_b32 s11, s7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s4, s2
; GFX11-NEXT: s_mov_b32 s5, s3
; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s2, s6
; GFX11-NEXT: s_mov_b32 s3, s7
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: s_cbranch_vccnz .LBB0_2
; GFX11-NEXT: ; %bb.1: ; %one
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB0_2: ; %two
; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: br_cc_f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[8:9], s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s3
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v2.l, v2.h
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB0_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %one
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
; GFX11-TRUE16-NEXT: .LBB0_2: ; %two
; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: br_cc_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[8:9], s[4:5], 0x34
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s3
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB0_2
; GFX11-FAKE16-NEXT: ; %bb.1: ; %one
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_endpgm
; GFX11-FAKE16-NEXT: .LBB0_2: ; %two
; GFX11-FAKE16-NEXT: buffer_store_b16 v1, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
@ -151,25 +182,47 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: br_cc_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s4, s2
; GFX11-NEXT: s_mov_b32 s5, s3
; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %one
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX11-NEXT: .LBB1_2: ; %two
; GFX11-NEXT: s_mov_b32 s2, s6
; GFX11-NEXT: s_mov_b32 s3, s7
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: br_cc_f16_imm_a:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s3
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v1.l
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %one
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX11-TRUE16-NEXT: .LBB1_2: ; %two
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: br_cc_f16_imm_a:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s3
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-FAKE16-NEXT: ; %bb.1: ; %one
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX11-FAKE16-NEXT: .LBB1_2: ; %two
; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b) {
entry:
@ -235,25 +288,47 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: br_cc_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s4, s2
; GFX11-NEXT: s_mov_b32 s5, s3
; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: s_cbranch_vccz .LBB2_2
; GFX11-NEXT: ; %bb.1: ; %two
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX11-NEXT: .LBB2_2: ; %one
; GFX11-NEXT: s_mov_b32 s2, s6
; GFX11-NEXT: s_mov_b32 s3, s7
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: br_cc_f16_imm_b:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s3
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v1.l
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %two
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX11-TRUE16-NEXT: .LBB2_2: ; %one
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: br_cc_f16_imm_b:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s3
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v0
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_2
; GFX11-FAKE16-NEXT: ; %bb.1: ; %two
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX11-FAKE16-NEXT: .LBB2_2: ; %one
; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:

View File

@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; SI-LABEL: extract_vector_elt_v2f16:
@ -215,19 +216,34 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: extract_vector_elt_v3f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: extract_vector_elt_v3f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: extract_vector_elt_v3f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
; GFX11-FAKE16-NEXT: s_endpgm
%p0 = extractelement <3 x half> %foo, i32 0
%p1 = extractelement <3 x half> %foo, i32 2
%out1 = getelementptr half, ptr addrspace(1) %out, i32 1
@ -268,20 +284,35 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: dynamic_extract_vector_elt_v3f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s4, s6, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: dynamic_extract_vector_elt_v3f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s6, 4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: dynamic_extract_vector_elt_v3f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s6, 4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_endpgm
%p0 = extractelement <3 x half> %foo, i32 %idx
%out1 = getelementptr half, ptr addrspace(1) %out, i32 1
store half %p0, ptr addrspace(1) %out
@ -635,48 +666,91 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1)
; VI-NEXT: flat_store_short v[5:6], v0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: v_extractelement_v8f16_dynamic_sgpr:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
; GFX11-NEXT: s_cmp_eq_u32 s4, 1
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s4, 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-NEXT: s_cmp_eq_u32 s4, 3
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s4, 4
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s4, 5
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s4, 6
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX11-NEXT: s_cmp_eq_u32 s4, 7
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_extractelement_v8f16_dynamic_sgpr:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 4, v4
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v0, s[2:3]
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 1
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v5.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 3
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v5.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 5
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 7
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_extractelement_v8f16_dynamic_sgpr:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 4, v4
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v0, s[2:3]
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 1
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 3
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 4
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 5
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 6
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 1, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 7
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-FAKE16-NEXT: global_store_b16 v2, v0, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
@ -852,83 +926,161 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1)
; VI-NEXT: flat_store_short v[9:10], v0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: v_extractelement_v16f16_dynamic_sgpr:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] offset:16
; GFX11-NEXT: s_cmp_eq_u32 s4, 1
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s4, 2
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; GFX11-NEXT: s_cmp_eq_u32 s4, 3
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s4, 4
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s4, 5
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s4, 6
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX11-NEXT: s_cmp_eq_u32 s4, 7
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s4, 8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GFX11-NEXT: s_cmp_eq_u32 s4, 9
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s4, 10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5
; GFX11-NEXT: s_cmp_eq_u32 s4, 11
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s4, 12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6
; GFX11-NEXT: s_cmp_eq_u32 s4, 13
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s4, 14
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7
; GFX11-NEXT: s_cmp_eq_u32 s4, 15
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_extractelement_v16f16_dynamic_sgpr:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 5, v8
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v4, s[2:3] offset:16
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 1
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v9.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 3
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v9.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 5
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v8
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 7
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 8
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 9
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 10
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v5
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 11
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v5.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 12
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v6
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 13
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v6.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 14
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v7
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 15
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v7.l, s2
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_extractelement_v16f16_dynamic_sgpr:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 5, v8
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
; GFX11-FAKE16-NEXT: global_load_b128 v[4:7], v4, s[2:3] offset:16
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 1
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 3
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 4
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 5
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 6
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 1, v8
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 7
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 8
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 9
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v5
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 11
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 12
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v6
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 13
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 14
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v7
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 15
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-FAKE16-NEXT: global_store_b16 v2, v0, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext

View File

@ -2,7 +2,8 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
; DAGCombiner will transform:
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF))
@ -44,17 +45,30 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: s_fabs_free_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: s_fabs_free_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: s_fabs_free_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0x7fff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%bc= bitcast i16 %in to half
%fabs = call half @llvm.fabs.f16(half %bc)
store half %fabs, ptr addrspace(1) %out
@ -97,17 +111,30 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: s_fabs_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: s_fabs_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: s_fabs_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0x7fff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call half @llvm.fabs.f16(half %in)
store half %fabs, ptr addrspace(1) %out
ret void
@ -262,18 +289,33 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_fold_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: fabs_fold_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, |v0.l|, v0.h
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fabs_fold_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mul_f16_e64 v1, |s2|, s3
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call half @llvm.fabs.f16(half %in0)
%fmul = fmul half %fabs, %in1
store half %fmul, ptr addrspace(1) %out
@ -624,24 +666,43 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_extract_fabs_fold_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, 4.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_add_f16_e64 v1, |v1|, 2.0
; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_extract_fabs_fold_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, |v0.l|, 4.0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.h, |v1.l|, 2.0
; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_extract_fabs_fold_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, |v0|, 4.0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_add_f16_e64 v1, |v1|, 2.0
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v0, off dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v1, off dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
%val = load <2 x half>, ptr addrspace(1) %gep.in

File diff suppressed because it is too large Load Diff

View File

@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
; Make sure fdiv is promoted to f32.
@ -129,39 +130,75 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_fdiv_f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v3
; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v2, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_fdiv_f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v3
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v3
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v4, v6 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v4
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_fdiv_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v3
; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3
; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_div_fixup_f16 v1, v3, v2, v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) #0 {
@ -249,18 +286,31 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_rcp_f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_rcp_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -343,18 +393,31 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_abs:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e64 v1, |v1|
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_rcp_f16_abs:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e64 v0.l, |v0.l|
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_rcp_f16_abs:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_rcp_f16_e64 v1, |v1|
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -440,18 +503,31 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: reciprocal_f16_rounded:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: reciprocal_f16_rounded:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: reciprocal_f16_rounded:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -521,18 +597,31 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_afn:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_rcp_f16_afn:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_rcp_f16_afn:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -615,18 +704,31 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_neg:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_rcp_f16_neg:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e64 v0.l, -v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_rcp_f16_neg:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_rcp_f16_e64 v1, -v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -712,18 +814,31 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rsq_f16_e32 v1, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_rsq_f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_rsq_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -813,20 +928,35 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_neg:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rsq_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_rsq_f16_neg:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_rsq_f16_neg:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -921,20 +1051,35 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
; GFX10-NEXT: global_store_short v0, v2, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_multi_use:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rsq_f16_e32 v2, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_rsq_f16_multi_use:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.h, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_rsq_f16_multi_use:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v2, v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -1025,20 +1170,35 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_missing_contract0:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_rsq_f16_missing_contract0:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_rsq_f16_missing_contract0:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -1128,20 +1288,35 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_missing_contract1:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_rsq_f16_missing_contract1:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_rsq_f16_missing_contract1:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -1231,20 +1406,35 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r,
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-TRUE16-NEXT: v_rcp_f16_e64 v0.l, -v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FAKE16-NEXT: v_rcp_f16_e64 v1, -v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -1337,24 +1527,43 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1)
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_fdiv_f16_afn:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_fdiv_f16_afn:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.h, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_fdiv_f16_afn:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v2, v2
; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -1448,24 +1657,43 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_fdiv_f16_unsafe:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_fdiv_f16_unsafe:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.h, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_fdiv_f16_unsafe:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v2, v2
; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -1528,16 +1756,27 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX10-NEXT: global_store_short v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: div_afn_2_x_pat_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: div_afn_2_x_pat_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.5, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: div_afn_2_x_pat_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, 0.5, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%x = load half, ptr addrspace(1) poison
%rcp = fdiv afn half %x, 2.0
store half %rcp, ptr addrspace(1) %out, align 4
@ -1593,16 +1832,27 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX10-NEXT: global_store_short v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: div_afn_k_x_pat_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: div_afn_k_x_pat_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x2e66, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: div_afn_k_x_pat_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%x = load half, ptr addrspace(1) poison
%rcp = fdiv afn half %x, 10.0
store half %rcp, ptr addrspace(1) %out, align 4
@ -1658,16 +1908,27 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX10-NEXT: global_store_short v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: div_afn_neg_k_x_pat_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, 0xae66, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: div_afn_neg_k_x_pat_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0xae66, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: div_afn_neg_k_x_pat_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, 0xae66, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%x = load half, ptr addrspace(1) poison
%rcp = fdiv afn half %x, -10.0
store half %rcp, ptr addrspace(1) %out, align 4
@ -1719,13 +1980,21 @@ define half @v_fdiv_f16_arcp(half %x, half %y) {
; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_f16_arcp:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: v_fdiv_f16_arcp:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v1.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_fdiv_f16_arcp:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp half %x, %y
ret half %fdiv
}
@ -1763,13 +2032,21 @@ define half @v_fdiv_f16_afn_nsz(half %x, half %y) {
; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_f16_afn_nsz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: v_fdiv_f16_afn_nsz:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v1.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_fdiv_f16_afn_nsz:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn nsz half %x, %y
ret half %fdiv
}
@ -1924,16 +2201,27 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rsq_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rsq_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_rsq_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: v_rsq_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_rsq_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX9-IEEE-LABEL: v_rsq_v2f16:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@ -2125,16 +2413,27 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX10-NEXT: v_pack_b32_f16 v0, -v0, -v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_neg_rsq_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rsq_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_rsq_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_pack_b32_f16 v0, -v0, -v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: v_neg_rsq_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_neg_rsq_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, -v0, -v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX9-IEEE-LABEL: v_neg_rsq_v2f16:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

View File

@ -7,8 +7,10 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-FAKE16
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-TRUE16
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-FAKE16
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12-SDAG-TRUE16
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12-SDAG-FAKE16
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12-GISEL-TRUE16
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12-GISEL-FAKE16
declare half @llvm.fma.f16(half, half, half)
declare half @llvm.maxnum.f16(half, half)
@ -52,15 +54,47 @@ define half @test_fma(half %x, half %y, half %z) {
; GFX11-GISEL-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fma:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-SDAG-TRUE16-LABEL: test_fma:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, v2.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_fma:
; GFX12-SDAG-FAKE16: ; %bb.0:
; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-TRUE16-LABEL: test_fma:
; GFX12-GISEL-TRUE16: ; %bb.0:
; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX12-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-FAKE16-LABEL: test_fma:
; GFX12-GISEL-FAKE16: ; %bb.0:
; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX12-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
%r = call half @llvm.fma.f16(half %x, half %y, half %z)
ret half %r
}
@ -103,15 +137,45 @@ define half @test_fmac(half %x, half %y, half %z) {
; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmac:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_fmac_f16_e32 v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-SDAG-TRUE16-LABEL: test_fmac:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v2.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_fmac:
; GFX12-SDAG-FAKE16: ; %bb.0:
; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-TRUE16-LABEL: test_fmac:
; GFX12-GISEL-TRUE16: ; %bb.0:
; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v2.l
; GFX12-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-FAKE16-LABEL: test_fmac:
; GFX12-GISEL-FAKE16: ; %bb.0:
; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2
; GFX12-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
%r = call half @llvm.fma.f16(half %y, half %z, half %x)
ret half %r
}
@ -162,15 +226,45 @@ define half @test_fmaak(half %x, half %y, half %z) {
; GFX11-GISEL-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmaak:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-SDAG-TRUE16-LABEL: test_fmaak:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v1.l, 0x4200
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_fmaak:
; GFX12-SDAG-FAKE16: ; %bb.0:
; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-TRUE16-LABEL: test_fmaak:
; GFX12-GISEL-TRUE16: ; %bb.0:
; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v1.l, 0x4200
; GFX12-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-FAKE16-LABEL: test_fmaak:
; GFX12-GISEL-FAKE16: ; %bb.0:
; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
; GFX12-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
%r = call half @llvm.fma.f16(half %x, half %y, half 0xH4200)
ret half %r
}
@ -223,15 +317,47 @@ define half @test_fmamk(half %x, half %y, half %z) {
; GFX11-GISEL-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmamk:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-SDAG-TRUE16-LABEL: test_fmamk:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: v_fmamk_f16 v0.l, v0.l, 0x4200, v2.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_fmamk:
; GFX12-SDAG-FAKE16: ; %bb.0:
; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-TRUE16-LABEL: test_fmamk:
; GFX12-GISEL-TRUE16: ; %bb.0:
; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 0x4200, v0.l
; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX12-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-FAKE16-LABEL: test_fmamk:
; GFX12-GISEL-FAKE16: ; %bb.0:
; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
; GFX12-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
%r = call half @llvm.fma.f16(half %x, half 0xH4200, half %z)
ret half %r
}
@ -340,42 +466,79 @@ define i32 @test_D139469_f16(half %arg) {
; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: test_D139469_f16:
; GFX12-SDAG: ; %bb.0: ; %bb
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e
; GFX12-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
; GFX12-SDAG-NEXT: v_min_num_f16_e32 v0, v2, v1
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
; GFX12-SDAG-TRUE16-LABEL: test_D139469_f16:
; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e
; GFX12-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l
; GFX12-SDAG-TRUE16-NEXT: v_min_num_f16_e32 v0.l, v1.l, v0.h
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: test_D139469_f16:
; GFX12-GISEL: ; %bb.0: ; %bb
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e
; GFX12-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
; GFX12-SDAG-FAKE16-LABEL: test_D139469_f16:
; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
; GFX12-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v2, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-TRUE16-LABEL: test_D139469_f16:
; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb
; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e
; GFX12-GISEL-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l
; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l
; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h
; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-FAKE16-LABEL: test_D139469_f16:
; GFX12-GISEL-FAKE16: ; %bb.0: ; %bb
; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
; GFX12-GISEL-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
; GFX12-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%i = fmul contract half %arg, 0xH291E
%i1 = fcmp olt half %i, 0xH0000
@ -525,55 +688,103 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: test_D139469_v2f16:
; GFX12-SDAG: ; %bb.0: ; %bb
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x211e
; GFX12-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_pk_min_num_f16 v0, v1, v0
; GFX12-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
; GFX12-SDAG-TRUE16-LABEL: test_D139469_v2f16:
; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x211e
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_pk_min_num_f16 v0, v1, v0
; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: test_D139469_v2f16:
; GFX12-GISEL: ; %bb.0: ; %bb
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e
; GFX12-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-GISEL-NEXT: s_or_b32 s0, s1, s2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
; GFX12-SDAG-FAKE16-LABEL: test_D139469_v2f16:
; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x211e
; GFX12-SDAG-FAKE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_pk_min_num_f16 v0, v1, v0
; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-TRUE16-LABEL: test_D139469_v2f16:
; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb
; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
; GFX12-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l
; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2.h
; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l
; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v0.h
; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2
; GFX12-GISEL-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX12-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-FAKE16-LABEL: test_D139469_v2f16:
; GFX12-GISEL-FAKE16: ; %bb.0: ; %bb
; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
; GFX12-GISEL-FAKE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
; GFX12-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX12-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
; GFX12-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2
; GFX12-GISEL-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX12-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%i = fmul contract <2 x half> %arg, <half 0xH291E, half 0xH291E>
%i1 = fcmp olt <2 x half> %i, <half 0xH0000, half 0xH0000>

View File

@ -8,8 +8,10 @@
; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s
define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 {
; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_f16:
@ -58,18 +60,31 @@ define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 {
; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_f16:
; GFX11-SAFE: ; %bb.0:
; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_f16:
; GFX11-NNAN: ; %bb.0:
; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NNAN-NEXT: v_max_f16_e32 v0, v0, v1
; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31]
; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_f16:
; GFX11-SAFE-FAKE16: ; %bb.0:
; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-TRUE16-LABEL: test_fmax_legacy_ugt_f16:
; GFX11-NNAN-TRUE16: ; %bb.0:
; GFX11-NNAN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NNAN-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l
; GFX11-NNAN-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-FAKE16-LABEL: test_fmax_legacy_ugt_f16:
; GFX11-NNAN-FAKE16: ; %bb.0:
; GFX11-NNAN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NNAN-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1
; GFX11-NNAN-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ugt half %a, %b
%val = select i1 %cmp, half %a, half %b
ret half %val
@ -146,18 +161,30 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v3
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
; GFX11-SAFE: ; %bb.0:
; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v2f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v1.l
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3.l, v2.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v3.l, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v2f16:
; GFX11-SAFE-FAKE16: ; %bb.0:
; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v2f16:
; GFX11-NNAN: ; %bb.0:
@ -256,20 +283,35 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v5
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
; GFX11-SAFE: ; %bb.0:
; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v3f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v2.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v1.l, v3.l
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v5.l, v4.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v5.l, s0
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v3f16:
; GFX11-SAFE-FAKE16: ; %bb.0:
; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v3f16:
; GFX11-NNAN: ; %bb.0:
@ -392,26 +434,45 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v7
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
; GFX11-SAFE: ; %bb.0:
; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SAFE-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX11-SAFE-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v4f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1.l, v3.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v2.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v5.l, v4.l
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v7.l, v6.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v5.l, s1
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v6.l, v7.l, s2
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v4f16:
; GFX11-SAFE-FAKE16: ; %bb.0:
; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v4f16:
; GFX11-NNAN: ; %bb.0:
@ -612,40 +673,69 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
; SI-NNAN-NEXT: v_max_f32_e32 v7, v7, v15
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
; GFX11-SAFE: ; %bb.0:
; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v7
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v3
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v5
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v1
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-SAFE-NEXT: v_perm_b32 v2, v11, v2, 0x5040100
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7
; GFX11-SAFE-NEXT: v_perm_b32 v1, v12, v1, 0x5040100
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
; GFX11-SAFE-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SAFE-NEXT: v_perm_b32 v3, v10, v3, 0x5040100
; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v8f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v5
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v7
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9.l, v8.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v11.l, v10.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v13.l, v12.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s3, v0.l, v4.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v15.l, v14.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s4, v1.l, v5.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s5, v2.l, v6.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s6, v3.l, v7.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v12.l, v13.l, s1
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.h, v14.l, v15.l, s2
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v11.l, s0
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v9.l, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v8f16:
; GFX11-SAFE-FAKE16: ; %bb.0:
; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v8f16:
; GFX11-NNAN: ; %bb.0:

View File

@ -8,8 +8,10 @@
; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s
define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 {
@ -59,18 +61,31 @@ define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 {
; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: test_fmin_legacy_ule_f16:
; GFX11-SAFE: ; %bb.0:
; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmin_legacy_ule_f16:
; GFX11-NNAN: ; %bb.0:
; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NNAN-NEXT: v_min_f16_e32 v0, v0, v1
; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31]
; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_f16:
; GFX11-SAFE-FAKE16: ; %bb.0:
; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-TRUE16-LABEL: test_fmin_legacy_ule_f16:
; GFX11-NNAN-TRUE16: ; %bb.0:
; GFX11-NNAN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NNAN-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l
; GFX11-NNAN-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-FAKE16-LABEL: test_fmin_legacy_ule_f16:
; GFX11-NNAN-FAKE16: ; %bb.0:
; GFX11-NNAN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NNAN-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1
; GFX11-NNAN-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule half %a, %b
%val = select i1 %cmp, half %a, half %b
ret half %val
@ -147,18 +162,30 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v3
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
; GFX11-SAFE: ; %bb.0:
; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v2f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v1.l
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3.l, v2.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v3.l, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v2f16:
; GFX11-SAFE-FAKE16: ; %bb.0:
; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v2f16:
; GFX11-NNAN: ; %bb.0:
@ -257,20 +284,35 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v5
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
; GFX11-SAFE: ; %bb.0:
; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v3f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v2.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v1.l, v3.l
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v5.l, v4.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v5.l, s0
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v3f16:
; GFX11-SAFE-FAKE16: ; %bb.0:
; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v3f16:
; GFX11-NNAN: ; %bb.0:
@ -393,26 +435,45 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v7
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
; GFX11-SAFE: ; %bb.0:
; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SAFE-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX11-SAFE-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v4f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1.l, v3.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v2.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v5.l, v4.l
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v7.l, v6.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v5.l, s1
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v6.l, v7.l, s2
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v4f16:
; GFX11-SAFE-FAKE16: ; %bb.0:
; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v4f16:
; GFX11-NNAN: ; %bb.0:
@ -613,40 +674,69 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
; SI-NNAN-NEXT: v_min_f32_e32 v7, v7, v15
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
; GFX11-SAFE: ; %bb.0:
; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v7
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v3
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v5
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v1
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-SAFE-NEXT: v_perm_b32 v2, v11, v2, 0x5040100
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7
; GFX11-SAFE-NEXT: v_perm_b32 v1, v12, v1, 0x5040100
; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
; GFX11-SAFE-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SAFE-NEXT: v_perm_b32 v3, v10, v3, 0x5040100
; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v8f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v5
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v7
; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9.l, v8.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v11.l, v10.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v13.l, v12.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s3, v0.l, v4.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v15.l, v14.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s4, v1.l, v5.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s5, v2.l, v6.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s6, v3.l, v7.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v12.l, v13.l, s1
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.h, v14.l, v15.l, s2
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v11.l, s0
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v9.l, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v8f16:
; GFX11-SAFE-FAKE16: ; %bb.0:
; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v8f16:
; GFX11-NNAN: ; %bb.0:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,8 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=CIVI,CI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=CIVI,VI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) {
; CI-LABEL: fneg_fabs_fadd_f16:
@ -46,18 +47,33 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fabs_fadd_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_sub_f16_e64 v1, s3, |s2|
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: fneg_fabs_fadd_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_sub_f16_e64 v0.l, v0.l, |v0.h|
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fneg_fabs_fadd_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_sub_f16_e64 v1, s3, |s2|
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call half @llvm.fabs.f16(half %x)
%fsub = fsub half -0.0, %fabs
%fadd = fadd half %y, %fsub
@ -108,18 +124,33 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fabs_fmul_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mul_f16_e64 v1, s3, -|s2|
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: fneg_fabs_fmul_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -|v0.h|
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fneg_fabs_fmul_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mul_f16_e64 v1, s3, -|s2|
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call half @llvm.fabs.f16(half %x)
%fsub = fsub half -0.0, %fabs
%fmul = fmul half %y, %fsub
@ -166,17 +197,30 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fabs_free_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitset1_b32 s2, 15
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: fneg_fabs_free_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fneg_fabs_free_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 15
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%bc = bitcast i16 %in to half
%fabs = call half @llvm.fabs.f16(half %bc)
%fsub = fsub half -0.0, %fabs
@ -220,17 +264,30 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fabs_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitset1_b32 s2, 15
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: fneg_fabs_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fneg_fabs_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 15
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call half @llvm.fabs.f16(half %in)
%fsub = fsub half -0.0, %fabs
store half %fsub, ptr addrspace(1) %out, align 2
@ -263,16 +320,27 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_fneg_fabs_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_fneg_fabs_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_fneg_fabs_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%val = load half, ptr addrspace(1) %in, align 2
%fabs = call half @llvm.fabs.f16(half %val)
%fsub = fsub half -0.0, %fabs

View File

@ -2,7 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,CI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
; FIXME: Should be able to do scalar op
define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
@ -41,17 +42,30 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: s_fneg_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: s_fneg_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%fneg = fsub half -0.0, %in
store half %fneg, ptr addrspace(1) %out
ret void
@ -99,18 +113,31 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_fneg_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_fneg_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_fneg_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.in = getelementptr inbounds half, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr inbounds half, ptr addrspace(1) %in, i32 %tid
@ -156,17 +183,30 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_free_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: s_fneg_free_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: s_fneg_free_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%bc = bitcast i16 %in to half
%fsub = fsub half -0.0, %bc
store half %fsub, ptr addrspace(1) %out
@ -216,16 +256,27 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_fneg_fold_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e64 v1, -v1, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_fneg_fold_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_fneg_fold_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_mul_f16_e64 v1, -v1, v1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%val = load half, ptr addrspace(1) %in
%fsub = fsub half -0.0, %val
%fmul = fmul half %fsub, %val
@ -572,22 +623,39 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_extract_fneg_fold_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_mul_f16_e32 v0, -4.0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_sub_f16_e32 v1, 2.0, v1
; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_extract_fneg_fold_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, -4.0, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v0.h, 2.0, v1.l
; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_extract_fneg_fold_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, -4.0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_sub_f16_e32 v1, 2.0, v1
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v0, off dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v1, off dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_endpgm
%val = load <2 x half>, ptr addrspace(1) %in
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
%elt0 = extractelement <2 x half> %fneg, i32 0

View File

@ -2,7 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @fsub_f16(
; SI-LABEL: fsub_f16:
@ -54,29 +55,53 @@ define amdgpu_kernel void @fsub_f16(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
; GFX11-NEXT: s_mov_b32 s6, s10
; GFX11-NEXT: s_mov_b32 s7, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: fsub_f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fsub_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
; GFX11-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
@ -127,23 +152,41 @@ define amdgpu_kernel void @fsub_f16_imm_a(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
; GFX11-NEXT: s_mov_b32 s11, s7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s8, s2
; GFX11-NEXT: s_mov_b32 s9, s3
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: v_sub_f16_e32 v0, 1.0, v0
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: fsub_f16_imm_a:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v0.l, 1.0, v0.l
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fsub_f16_imm_a:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: v_sub_f16_e32 v0, 1.0, v0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b) {
entry:
@ -192,23 +235,41 @@ define amdgpu_kernel void @fsub_f16_imm_b(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
; GFX11-NEXT: s_mov_b32 s11, s7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s8, s2
; GFX11-NEXT: s_mov_b32 s9, s3
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: v_add_f16_e32 v0, -2.0, v0
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: fsub_f16_imm_b:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -2.0, v0.l
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fsub_f16_imm_b:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, -2.0, v0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
@ -639,17 +640,31 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_0.0_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_0.0_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0, v0.l ; encoding: [0x80,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_0.0_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_0.0_f16:
; VI: ; %bb.0:
@ -693,17 +708,31 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_0.5_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_0.5_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0.5, v0.l ; encoding: [0xf0,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_0.5_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_0.5_f16:
; VI: ; %bb.0:
@ -747,17 +776,31 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_neg_0.5_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_neg_0.5_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -0.5, v0.l ; encoding: [0xf1,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_neg_0.5_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_neg_0.5_f16:
; VI: ; %bb.0:
@ -801,17 +844,31 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_1.0_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_1.0_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; encoding: [0xf2,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_1.0_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_1.0_f16:
; VI: ; %bb.0:
@ -855,17 +912,31 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_neg_1.0_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_neg_1.0_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -1.0, v0.l ; encoding: [0xf3,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_neg_1.0_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_neg_1.0_f16:
; VI: ; %bb.0:
@ -909,17 +980,31 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_2.0_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_2.0_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; encoding: [0xf4,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_2.0_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_2.0_f16:
; VI: ; %bb.0:
@ -963,17 +1048,31 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_neg_2.0_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_neg_2.0_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -2.0, v0.l ; encoding: [0xf5,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_neg_2.0_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_neg_2.0_f16:
; VI: ; %bb.0:
@ -1017,17 +1116,31 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_4.0_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_4.0_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 4.0, v0.l ; encoding: [0xf6,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_4.0_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_4.0_f16:
; VI: ; %bb.0:
@ -1071,17 +1184,31 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_neg_4.0_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_neg_4.0_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -4.0, v0.l ; encoding: [0xf7,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_neg_4.0_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_neg_4.0_f16:
; VI: ; %bb.0:
@ -1131,23 +1258,41 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out,
; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: commute_add_inline_imm_0.5_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
; GFX11-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
; GFX11-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
; GFX11-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
; GFX11-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e32 v0, 0.5, v0 ; encoding: [0xf0,0x00,0x00,0x64]
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: commute_add_inline_imm_0.5_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0.5, v0.l ; encoding: [0xf0,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: commute_add_inline_imm_0.5_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 0.5, v0 ; encoding: [0xf0,0x00,0x00,0x64]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: commute_add_inline_imm_0.5_f16:
; VI: ; %bb.0:
@ -1211,23 +1356,41 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad
; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: commute_add_literal_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
; GFX11-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
; GFX11-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
; GFX11-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
; GFX11-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e32 v0, 0x6400, v0 ; encoding: [0xff,0x00,0x00,0x64,0x00,0x64,0x00,0x00]
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: commute_add_literal_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0x6400, v0.l ; encoding: [0xff,0x00,0x00,0x64,0x00,0x64,0x00,0x00]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: commute_add_literal_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 0x6400, v0 ; encoding: [0xff,0x00,0x00,0x64,0x00,0x64,0x00,0x00]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: commute_add_literal_f16:
; VI: ; %bb.0:
@ -1285,17 +1448,31 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_1_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_1_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1, v0.l ; encoding: [0x81,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_1_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_1_f16:
; VI: ; %bb.0:
@ -1339,17 +1516,31 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_2_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_2_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 2, v0.l ; encoding: [0x82,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_2_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_2_f16:
; VI: ; %bb.0:
@ -1393,17 +1584,31 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_16_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_16_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 16, v0.l ; encoding: [0x90,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_16_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_16_f16:
; VI: ; %bb.0:
@ -1684,17 +1889,31 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_63_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_63_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 63, v0.l ; encoding: [0xbf,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_63_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_63_f16:
; VI: ; %bb.0:
@ -1738,17 +1957,31 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: add_inline_imm_64_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00]
; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
; GFX11-TRUE16-LABEL: add_inline_imm_64_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 64, v0.l ; encoding: [0xc0,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; GFX11-FAKE16-LABEL: add_inline_imm_64_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00]
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_64_f16:
; VI: ; %bb.0:
@ -1789,12 +2022,19 @@ define void @mul_inline_imm_0.5_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: mul_inline_imm_0.5_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: v_mul_lo_u16 v2, 0x3800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x38,0x00,0x00]
; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
; GFX11-TRUE16-LABEL: mul_inline_imm_0.5_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0x3800, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x38,0x00,0x00]
; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX11-FAKE16-LABEL: mul_inline_imm_0.5_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0x3800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x38,0x00,0x00]
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_0.5_i16:
; VI: ; %bb.0:
@ -1829,12 +2069,19 @@ define void @mul_inline_imm_neg_0.5_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: mul_inline_imm_neg_0.5_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: v_mul_lo_u16 v2, 0xb800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xb8,0xff,0xff]
; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
; GFX11-TRUE16-LABEL: mul_inline_imm_neg_0.5_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0xb800, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xb8,0xff,0xff]
; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX11-FAKE16-LABEL: mul_inline_imm_neg_0.5_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0xb800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xb8,0xff,0xff]
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_neg_0.5_i16:
; VI: ; %bb.0:
@ -1869,12 +2116,19 @@ define void @mul_inline_imm_1.0_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: mul_inline_imm_1.0_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: v_mul_lo_u16 v2, 0x3c00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x3c,0x00,0x00]
; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
; GFX11-TRUE16-LABEL: mul_inline_imm_1.0_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0x3c00, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x3c,0x00,0x00]
; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX11-FAKE16-LABEL: mul_inline_imm_1.0_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0x3c00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x3c,0x00,0x00]
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_1.0_i16:
; VI: ; %bb.0:
@ -1909,12 +2163,19 @@ define void @mul_inline_imm_neg_1.0_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: mul_inline_imm_neg_1.0_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: v_mul_lo_u16 v2, 0xbc00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xbc,0xff,0xff]
; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
; GFX11-TRUE16-LABEL: mul_inline_imm_neg_1.0_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0xbc00, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xbc,0xff,0xff]
; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX11-FAKE16-LABEL: mul_inline_imm_neg_1.0_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0xbc00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xbc,0xff,0xff]
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_neg_1.0_i16:
; VI: ; %bb.0:
@ -1949,12 +2210,19 @@ define void @shl_inline_imm_2.0_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: shl_inline_imm_2.0_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: v_lshlrev_b16 v2, v2, 0x4000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0x40,0x00,0x00]
; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
; GFX11-TRUE16-LABEL: shl_inline_imm_2.0_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, v2.l, 0x4000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0x40,0x00,0x00]
; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX11-FAKE16-LABEL: shl_inline_imm_2.0_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v2, v2, 0x4000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0x40,0x00,0x00]
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: shl_inline_imm_2.0_i16:
; VI: ; %bb.0:
@ -1989,12 +2257,19 @@ define void @shl_inline_imm_neg_2.0_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: shl_inline_imm_neg_2.0_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: v_lshlrev_b16 v2, v2, 0xc000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0xc0,0xff,0xff]
; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
; GFX11-TRUE16-LABEL: shl_inline_imm_neg_2.0_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, v2.l, 0xc000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0xc0,0xff,0xff]
; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX11-FAKE16-LABEL: shl_inline_imm_neg_2.0_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v2, v2, 0xc000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0xc0,0xff,0xff]
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: shl_inline_imm_neg_2.0_i16:
; VI: ; %bb.0:
@ -2029,12 +2304,19 @@ define void @mul_inline_imm_4.0_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: mul_inline_imm_4.0_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: v_mul_lo_u16 v2, 0x4400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x44,0x00,0x00]
; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
; GFX11-TRUE16-LABEL: mul_inline_imm_4.0_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0x4400, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x44,0x00,0x00]
; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX11-FAKE16-LABEL: mul_inline_imm_4.0_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0x4400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x44,0x00,0x00]
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_4.0_i16:
; VI: ; %bb.0:
@ -2069,12 +2351,19 @@ define void @mul_inline_imm_neg_4.0_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: mul_inline_imm_neg_4.0_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: v_mul_lo_u16 v2, 0xc400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xc4,0xff,0xff]
; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
; GFX11-TRUE16-LABEL: mul_inline_imm_neg_4.0_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0xc400, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xc4,0xff,0xff]
; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX11-FAKE16-LABEL: mul_inline_imm_neg_4.0_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0xc400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xc4,0xff,0xff]
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_neg_4.0_i16:
; VI: ; %bb.0:
@ -2109,12 +2398,19 @@ define void @mul_inline_imm_inv2pi_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
; GFX11-LABEL: mul_inline_imm_inv2pi_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: v_mul_lo_u16 v2, 0x3118, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x18,0x31,0x00,0x00]
; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
; GFX11-TRUE16-LABEL: mul_inline_imm_inv2pi_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0x3118, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x18,0x31,0x00,0x00]
; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; GFX11-FAKE16-LABEL: mul_inline_imm_inv2pi_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0x3118, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x18,0x31,0x00,0x00]
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_inv2pi_i16:
; VI: ; %bb.0:

File diff suppressed because it is too large Load Diff