Convert "denormal-fp-math" and "denormal-fp-math-f32" into a first-class denormal_fpenv attribute. Previously, querying the effective denormal mode required two string attribute lookups, each with parsing. I'm introducing more uses of this query, so it makes sense to switch to a more efficient encoding. The old representation was also awkward because it was split across two separate attributes. The new encoding stores the default and float modes as bitfields, largely avoiding the need to consider whether the other mode is set. The syntax in the common cases looks like this: `denormal_fpenv(preservesign,preservesign)` `denormal_fpenv(float: preservesign,preservesign)` `denormal_fpenv(dynamic,dynamic float: preservesign,preservesign)` I wasn't sure about reusing the float type name instead of adding a new keyword. It's parsed as a type, but only float is accepted. I'm also debating renaming the attribute to use "subnormal", to match the currently preferred IEEE terminology (also used by nofpclass and in other contexts). This changes behavior when the debug command-line flags are used to set the denormal mode. Previously the flags skipped functions that had an explicit attribute set, tracked separately for the default and f32 versions. Now that these are one attribute, the flag logic can't distinguish which of the two components was explicitly set on the function. Only one test appeared to rely on this behavior, so I just avoided using the flags in it. This change also does not perform all the code cleanups it enables; in particular, the attributor handling could be cleaned up. I also guessed at how to support this in MLIR. I followed MemoryEffects as a reference; it appears bitfields are expanded into arguments to attributes, so the representation there is a bit uglier, with the two 2-element fields flattened into 4 arguments.
1020 lines
45 KiB
LLVM
1020 lines
45 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
|
|
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
|
|
|
|
; Two f32 divisions that share the divisor %D, with no fast-math flags on
; either fdiv. The expected output for every target contains two v_rcp_f32 and
; two v_div_fixup_f32, i.e. each quotient gets its own full division expansion
; and no shared reciprocal of %D is formed.
; Attribute group #0 is defined outside this excerpt.
define <2 x float> @v_repeat_divisor_f32_x2(float %x, float %y, float %D) #0 {
|
|
; GFX6-LABEL: v_repeat_divisor_f32_x2:
|
|
; GFX6: ; %bb.0:
|
|
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX6-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v0
|
|
; GFX6-NEXT: v_rcp_f32_e32 v4, v3
|
|
; GFX6-NEXT: v_fma_f32 v5, -v3, v4, 1.0
|
|
; GFX6-NEXT: v_fma_f32 v4, v5, v4, v4
|
|
; GFX6-NEXT: v_div_scale_f32 v5, vcc, v0, v2, v0
|
|
; GFX6-NEXT: v_mul_f32_e32 v6, v5, v4
|
|
; GFX6-NEXT: v_fma_f32 v7, -v3, v6, v5
|
|
; GFX6-NEXT: v_fma_f32 v6, v7, v4, v6
|
|
; GFX6-NEXT: v_fma_f32 v3, -v3, v6, v5
|
|
; GFX6-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v1
|
|
; GFX6-NEXT: v_rcp_f32_e32 v7, v5
|
|
; GFX6-NEXT: v_div_fmas_f32 v3, v3, v4, v6
|
|
; GFX6-NEXT: v_div_fixup_f32 v0, v3, v2, v0
|
|
; GFX6-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1
|
|
; GFX6-NEXT: v_fma_f32 v3, -v5, v7, 1.0
|
|
; GFX6-NEXT: v_fma_f32 v3, v3, v7, v7
|
|
; GFX6-NEXT: v_mul_f32_e32 v6, v4, v3
|
|
; GFX6-NEXT: v_fma_f32 v7, -v5, v6, v4
|
|
; GFX6-NEXT: v_fma_f32 v6, v7, v3, v6
|
|
; GFX6-NEXT: v_fma_f32 v4, -v5, v6, v4
|
|
; GFX6-NEXT: v_div_fmas_f32 v3, v4, v3, v6
|
|
; GFX6-NEXT: v_div_fixup_f32 v1, v3, v2, v1
|
|
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_repeat_divisor_f32_x2:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v0
|
|
; GFX9-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v1
|
|
; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v2, v0
|
|
; GFX9-NEXT: v_div_scale_f32 v6, s[4:5], v1, v2, v1
|
|
; GFX9-NEXT: v_rcp_f32_e32 v7, v3
|
|
; GFX9-NEXT: v_rcp_f32_e32 v8, v4
|
|
; GFX9-NEXT: v_fma_f32 v9, -v3, v7, 1.0
|
|
; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7
|
|
; GFX9-NEXT: v_fma_f32 v10, -v4, v8, 1.0
|
|
; GFX9-NEXT: v_fma_f32 v8, v10, v8, v8
|
|
; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7
|
|
; GFX9-NEXT: v_mul_f32_e32 v10, v6, v8
|
|
; GFX9-NEXT: v_fma_f32 v11, -v3, v9, v5
|
|
; GFX9-NEXT: v_fma_f32 v12, -v4, v10, v6
|
|
; GFX9-NEXT: v_fma_f32 v9, v11, v7, v9
|
|
; GFX9-NEXT: v_fma_f32 v10, v12, v8, v10
|
|
; GFX9-NEXT: v_fma_f32 v3, -v3, v9, v5
|
|
; GFX9-NEXT: v_fma_f32 v4, -v4, v10, v6
|
|
; GFX9-NEXT: v_div_fmas_f32 v3, v3, v7, v9
|
|
; GFX9-NEXT: s_mov_b64 vcc, s[4:5]
|
|
; GFX9-NEXT: v_div_fmas_f32 v4, v4, v8, v10
|
|
; GFX9-NEXT: v_div_fixup_f32 v0, v3, v2, v0
|
|
; GFX9-NEXT: v_div_fixup_f32 v1, v4, v2, v1
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_repeat_divisor_f32_x2:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: v_div_scale_f32 v3, null, v2, v2, v0
|
|
; GFX11-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
|
|
; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v0, v2, v0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_rcp_f32_e32 v5, v3
|
|
; GFX11-NEXT: v_rcp_f32_e32 v6, v4
|
|
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
|
; GFX11-NEXT: v_fma_f32 v7, -v3, v5, 1.0
|
|
; GFX11-NEXT: v_fma_f32 v8, -v4, v6, 1.0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_dual_fmac_f32 v5, v7, v5 :: v_dual_fmac_f32 v6, v8, v6
|
|
; GFX11-NEXT: v_div_scale_f32 v7, s0, v1, v2, v1
|
|
; GFX11-NEXT: v_mul_f32_e32 v8, v9, v5
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_mul_f32_e32 v10, v7, v6
|
|
; GFX11-NEXT: v_fma_f32 v11, -v3, v8, v9
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_fma_f32 v12, -v4, v10, v7
|
|
; GFX11-NEXT: v_fmac_f32_e32 v8, v11, v5
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_fmac_f32_e32 v10, v12, v6
|
|
; GFX11-NEXT: v_fma_f32 v3, -v3, v8, v9
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_fma_f32 v4, -v4, v10, v7
|
|
; GFX11-NEXT: v_div_fmas_f32 v3, v3, v5, v8
|
|
; GFX11-NEXT: s_mov_b32 vcc_lo, s0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_div_fmas_f32 v4, v4, v6, v10
|
|
; GFX11-NEXT: v_div_fixup_f32 v0, v3, v2, v0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX11-NEXT: v_div_fixup_f32 v1, v4, v2, v1
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
%div0 = fdiv float %x, %D
|
|
%div1 = fdiv float %y, %D
|
|
%insert.0 = insertelement <2 x float> poison, float %div0, i32 0
|
|
%insert.1 = insertelement <2 x float> %insert.0, float %div1, i32 1
|
|
ret <2 x float> %insert.1
|
|
}
|
|
|
|
; Two f32 divisions that share the divisor %D, both carrying the arcp flag.
; The expected output for every target computes a single reciprocal of %D
; (one v_rcp_f32, with v_div_scale_f32 / v_div_fixup_f32 taking the constant
; 1.0 as numerator) and then forms both results with v_mul_f32.
; Attribute group #0 is defined outside this excerpt.
define <2 x float> @v_repeat_divisor_f32_x2_arcp(float %x, float %y, float %D) #0 {
|
|
; GFX6-LABEL: v_repeat_divisor_f32_x2_arcp:
|
|
; GFX6: ; %bb.0:
|
|
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX6-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, 1.0
|
|
; GFX6-NEXT: v_rcp_f32_e32 v4, v3
|
|
; GFX6-NEXT: v_fma_f32 v5, -v3, v4, 1.0
|
|
; GFX6-NEXT: v_fma_f32 v4, v5, v4, v4
|
|
; GFX6-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
|
|
; GFX6-NEXT: v_mul_f32_e32 v6, v5, v4
|
|
; GFX6-NEXT: v_fma_f32 v7, -v3, v6, v5
|
|
; GFX6-NEXT: v_fma_f32 v6, v7, v4, v6
|
|
; GFX6-NEXT: v_fma_f32 v3, -v3, v6, v5
|
|
; GFX6-NEXT: v_div_fmas_f32 v3, v3, v4, v6
|
|
; GFX6-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0
|
|
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
|
|
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
|
|
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_repeat_divisor_f32_x2_arcp:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, 1.0
|
|
; GFX9-NEXT: v_div_scale_f32 v4, vcc, 1.0, v2, 1.0
|
|
; GFX9-NEXT: v_rcp_f32_e32 v5, v3
|
|
; GFX9-NEXT: v_fma_f32 v6, -v3, v5, 1.0
|
|
; GFX9-NEXT: v_fma_f32 v5, v6, v5, v5
|
|
; GFX9-NEXT: v_mul_f32_e32 v6, v4, v5
|
|
; GFX9-NEXT: v_fma_f32 v7, -v3, v6, v4
|
|
; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6
|
|
; GFX9-NEXT: v_fma_f32 v3, -v3, v6, v4
|
|
; GFX9-NEXT: v_div_fmas_f32 v3, v3, v5, v6
|
|
; GFX9-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0
|
|
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
|
|
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_repeat_divisor_f32_x2_arcp:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: v_div_scale_f32 v3, null, v2, v2, 1.0
|
|
; GFX11-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v2, 1.0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_rcp_f32_e32 v4, v3
|
|
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
|
; GFX11-NEXT: v_fma_f32 v5, -v3, v4, 1.0
|
|
; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v4
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_mul_f32_e32 v5, v6, v4
|
|
; GFX11-NEXT: v_fma_f32 v7, -v3, v5, v6
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_fmac_f32_e32 v5, v7, v4
|
|
; GFX11-NEXT: v_fma_f32 v3, -v3, v5, v6
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_div_fmas_f32 v3, v3, v4, v5
|
|
; GFX11-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
|
|
; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
%div0 = fdiv arcp float %x, %D
|
|
%div1 = fdiv arcp float %y, %D
|
|
%insert.0 = insertelement <2 x float> poison, float %div0, i32 0
|
|
%insert.1 = insertelement <2 x float> %insert.0, float %div1, i32 1
|
|
ret <2 x float> %insert.1
|
|
}
|
|
|
|
; Two arcp f32 divisions that share the divisor %D, compiled with attribute
; group #1 instead of #0.
; NOTE(review): #1 is defined outside this excerpt; going by the _daz suffix it
; presumably selects a flushing f32 denormal mode - confirm against the
; attribute definitions at the end of the file.
; The expected output still forms a single reciprocal of %D followed by two
; v_mul_f32, and additionally brackets the FMA refinement sequence with
; mode-register writes: s_setreg_imm32_b32 on HW_REG_MODE (3, then 0) for the
; tahiti and gfx900 runs, and s_denorm_mode (15, then 12) for gfx1100.
define <2 x float> @v_repeat_divisor_f32_x2_arcp_daz(float %x, float %y, float %D) #1 {
|
|
; GFX6-LABEL: v_repeat_divisor_f32_x2_arcp_daz:
|
|
; GFX6: ; %bb.0:
|
|
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX6-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, 1.0
|
|
; GFX6-NEXT: v_rcp_f32_e32 v4, v3
|
|
; GFX6-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
|
|
; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
|
|
; GFX6-NEXT: v_fma_f32 v6, -v3, v4, 1.0
|
|
; GFX6-NEXT: v_fma_f32 v4, v6, v4, v4
|
|
; GFX6-NEXT: v_mul_f32_e32 v6, v5, v4
|
|
; GFX6-NEXT: v_fma_f32 v7, -v3, v6, v5
|
|
; GFX6-NEXT: v_fma_f32 v6, v7, v4, v6
|
|
; GFX6-NEXT: v_fma_f32 v3, -v3, v6, v5
|
|
; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
|
|
; GFX6-NEXT: v_div_fmas_f32 v3, v3, v4, v6
|
|
; GFX6-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0
|
|
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
|
|
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
|
|
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_repeat_divisor_f32_x2_arcp_daz:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, 1.0
|
|
; GFX9-NEXT: v_div_scale_f32 v4, vcc, 1.0, v2, 1.0
|
|
; GFX9-NEXT: v_rcp_f32_e32 v5, v3
|
|
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
|
|
; GFX9-NEXT: v_fma_f32 v6, -v3, v5, 1.0
|
|
; GFX9-NEXT: v_fma_f32 v5, v6, v5, v5
|
|
; GFX9-NEXT: v_mul_f32_e32 v6, v4, v5
|
|
; GFX9-NEXT: v_fma_f32 v7, -v3, v6, v4
|
|
; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6
|
|
; GFX9-NEXT: v_fma_f32 v3, -v3, v6, v4
|
|
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
|
|
; GFX9-NEXT: v_div_fmas_f32 v3, v3, v5, v6
|
|
; GFX9-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0
|
|
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
|
|
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_repeat_divisor_f32_x2_arcp_daz:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: v_div_scale_f32 v3, null, v2, v2, 1.0
|
|
; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v2, 1.0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_rcp_f32_e32 v4, v3
|
|
; GFX11-NEXT: s_denorm_mode 15
|
|
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
|
; GFX11-NEXT: v_fma_f32 v6, -v3, v4, 1.0
|
|
; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v4
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_mul_f32_e32 v6, v5, v4
|
|
; GFX11-NEXT: v_fma_f32 v7, -v3, v6, v5
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v4
|
|
; GFX11-NEXT: v_fma_f32 v3, -v3, v6, v5
|
|
; GFX11-NEXT: s_denorm_mode 12
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_div_fmas_f32 v3, v3, v4, v6
|
|
; GFX11-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
|
|
; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
%div0 = fdiv arcp float %x, %D
|
|
%div1 = fdiv arcp float %y, %D
|
|
%insert.0 = insertelement <2 x float> poison, float %div0, i32 0
|
|
%insert.1 = insertelement <2 x float> %insert.0, float %div1, i32 1
|
|
ret <2 x float> %insert.1
|
|
}
|
|
|
|
; Two arcp half divisions that share the divisor %D.
; Expected output per target:
;  - tahiti: the operands are converted to f32, 1.0 / %D goes through the f32
;    division expansion, the reciprocal is converted to f16 and back to f32,
;    and the two products are converted to f16 and packed with shift + or.
;  - gfx900: a single v_rcp_f16 feeds two v_mul_f16 and a v_pack_b32_f16.
;  - gfx1100: a single v_rcp_f16 feeds two v_mul_f16; the +real-true16 run
;    writes v0.l / v0.h directly, while the -real-true16 run uses full
;    registers plus v_pack_b32_f16, hence the two separate check prefixes.
; Attribute group #0 is defined outside this excerpt.
define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 {
|
|
; GFX6-LABEL: v_repeat_divisor_f16_x2_arcp:
|
|
; GFX6: ; %bb.0:
|
|
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
|
|
; GFX6-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, 1.0
|
|
; GFX6-NEXT: v_rcp_f32_e32 v4, v3
|
|
; GFX6-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
|
|
; GFX6-NEXT: v_fma_f32 v6, -v3, v4, 1.0
|
|
; GFX6-NEXT: v_fma_f32 v4, v6, v4, v4
|
|
; GFX6-NEXT: v_mul_f32_e32 v6, v5, v4
|
|
; GFX6-NEXT: v_fma_f32 v7, -v3, v6, v5
|
|
; GFX6-NEXT: v_fma_f32 v6, v7, v4, v6
|
|
; GFX6-NEXT: v_fma_f32 v3, -v3, v6, v5
|
|
; GFX6-NEXT: v_div_fmas_f32 v3, v3, v4, v6
|
|
; GFX6-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0
|
|
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
|
|
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
|
|
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
|
|
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
|
|
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
|
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
|
|
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_repeat_divisor_f16_x2_arcp:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_rcp_f16_e32 v2, v2
|
|
; GFX9-NEXT: v_mul_f16_e32 v0, v0, v2
|
|
; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2
|
|
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-TRUE16-LABEL: v_repeat_divisor_f16_x2_arcp:
|
|
; GFX11-TRUE16: ; %bb.0:
|
|
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v2.l, v2.l
|
|
; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
|
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
|
|
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.h, v1.l, v2.l
|
|
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-FAKE16-LABEL: v_repeat_divisor_f16_x2_arcp:
|
|
; GFX11-FAKE16: ; %bb.0:
|
|
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v2, v2
|
|
; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
|
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v2
|
|
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
|
|
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
|
%div0 = fdiv arcp half %x, %D
|
|
%div1 = fdiv arcp half %y, %D
|
|
%insert.0 = insertelement <2 x half> poison, half %div0, i32 0
|
|
%insert.1 = insertelement <2 x half> %insert.0, half %div1, i32 1
|
|
ret <2 x half> %insert.1
|
|
}
|
|
|
|
; Two arcp double divisions that share the divisor %D.
; The expected output for every target computes one reciprocal of %D (a single
; v_rcp_f64 with v_div_scale_f64 / v_div_fixup_f64 against the constant 1.0)
; and forms both results with v_mul_f64.
; In the tahiti output both v_div_scale_f64 write s[4:5] and vcc is instead
; derived with v_cmp_eq_u32 + s_xor_b64 before v_div_fmas_f64; the gfx900 and
; gfx1100 outputs take vcc / vcc_lo straight from v_div_scale_f64.
; NOTE(review): presumably a workaround for the tahiti v_div_scale condition
; output - confirm against the AMDGPU f64 division lowering.
; Attribute group #0 is defined outside this excerpt.
define <2 x double> @v_repeat_divisor_f64_x2_arcp(double %x, double %y, double %D) #0 {
|
|
; GFX6-LABEL: v_repeat_divisor_f64_x2_arcp:
|
|
; GFX6: ; %bb.0:
|
|
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX6-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], 1.0
|
|
; GFX6-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
|
|
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
|
|
; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
|
|
; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
|
|
; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], 1.0, v[4:5], 1.0
|
|
; GFX6-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0
|
|
; GFX6-NEXT: s_mov_b32 s4, 0x3ff00000
|
|
; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
|
|
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v11
|
|
; GFX6-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
|
|
; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc
|
|
; GFX6-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
|
|
; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
|
|
; GFX6-NEXT: v_div_fixup_f64 v[4:5], v[6:7], v[4:5], 1.0
|
|
; GFX6-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
|
|
; GFX6-NEXT: v_mul_f64 v[2:3], v[2:3], v[4:5]
|
|
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_repeat_divisor_f64_x2_arcp:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], 1.0
|
|
; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
|
|
; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
|
|
; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
|
|
; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[4:5], 1.0
|
|
; GFX9-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0
|
|
; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
|
|
; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
|
|
; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
|
|
; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
|
|
; GFX9-NEXT: v_div_fixup_f64 v[4:5], v[6:7], v[4:5], 1.0
|
|
; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
|
|
; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], v[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_repeat_divisor_f64_x2_arcp:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], 1.0
|
|
; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc_lo, 1.0, v[4:5], 1.0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
|
|
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
|
; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
|
|
; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
|
|
; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9]
|
|
; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13]
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11]
|
|
; GFX11-NEXT: v_div_fixup_f64 v[4:5], v[6:7], v[4:5], 1.0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
|
|
; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], v[4:5]
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
%div0 = fdiv arcp double %x, %D
|
|
%div1 = fdiv arcp double %y, %D
|
|
%insert.0 = insertelement <2 x double> poison, double %div0, i32 0
|
|
%insert.1 = insertelement <2 x double> %insert.0, double %div1, i32 1
|
|
ret <2 x double> %insert.1
|
|
}
|
|
|
|
; Three arcp f32 divisions that share the divisor %D.
; The expected output for every target computes one reciprocal of %D (a single
; v_rcp_f32 with v_div_fixup_f32 against the constant 1.0) and forms the three
; results with three v_mul_f32.
; Attribute group #0 is defined outside this excerpt.
define <3 x float> @v_repeat_divisor_f32_x3_arcp(float %x, float %y, float %z, float %D) #0 {
|
|
; GFX6-LABEL: v_repeat_divisor_f32_x3_arcp:
|
|
; GFX6: ; %bb.0:
|
|
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX6-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, 1.0
|
|
; GFX6-NEXT: v_rcp_f32_e32 v5, v4
|
|
; GFX6-NEXT: v_fma_f32 v6, -v4, v5, 1.0
|
|
; GFX6-NEXT: v_fma_f32 v5, v6, v5, v5
|
|
; GFX6-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
|
|
; GFX6-NEXT: v_mul_f32_e32 v7, v6, v5
|
|
; GFX6-NEXT: v_fma_f32 v8, -v4, v7, v6
|
|
; GFX6-NEXT: v_fma_f32 v7, v8, v5, v7
|
|
; GFX6-NEXT: v_fma_f32 v4, -v4, v7, v6
|
|
; GFX6-NEXT: v_div_fmas_f32 v4, v4, v5, v7
|
|
; GFX6-NEXT: v_div_fixup_f32 v3, v4, v3, 1.0
|
|
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3
|
|
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
|
|
; GFX6-NEXT: v_mul_f32_e32 v2, v2, v3
|
|
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_repeat_divisor_f32_x3_arcp:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, 1.0
|
|
; GFX9-NEXT: v_div_scale_f32 v5, vcc, 1.0, v3, 1.0
|
|
; GFX9-NEXT: v_rcp_f32_e32 v6, v4
|
|
; GFX9-NEXT: v_fma_f32 v7, -v4, v6, 1.0
|
|
; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6
|
|
; GFX9-NEXT: v_mul_f32_e32 v7, v5, v6
|
|
; GFX9-NEXT: v_fma_f32 v8, -v4, v7, v5
|
|
; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7
|
|
; GFX9-NEXT: v_fma_f32 v4, -v4, v7, v5
|
|
; GFX9-NEXT: v_div_fmas_f32 v4, v4, v6, v7
|
|
; GFX9-NEXT: v_div_fixup_f32 v3, v4, v3, 1.0
|
|
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v3
|
|
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
|
|
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v3
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_repeat_divisor_f32_x3_arcp:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: v_div_scale_f32 v4, null, v3, v3, 1.0
|
|
; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, 1.0, v3, 1.0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_rcp_f32_e32 v5, v4
|
|
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
|
; GFX11-NEXT: v_fma_f32 v6, -v4, v5, 1.0
|
|
; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v5
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_mul_f32_e32 v6, v7, v5
|
|
; GFX11-NEXT: v_fma_f32 v8, -v4, v6, v7
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_fmac_f32_e32 v6, v8, v5
|
|
; GFX11-NEXT: v_fma_f32 v4, -v4, v6, v7
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_div_fmas_f32 v4, v4, v5, v6
|
|
; GFX11-NEXT: v_div_fixup_f32 v3, v4, v3, 1.0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v3
|
|
; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3
|
|
; GFX11-NEXT: v_mul_f32_e32 v2, v2, v3
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
%div0 = fdiv arcp float %x, %D
|
|
%div1 = fdiv arcp float %y, %D
|
|
%div2 = fdiv arcp float %z, %D
|
|
%insert.0 = insertelement <3 x float> poison, float %div0, i32 0
|
|
%insert.1 = insertelement <3 x float> %insert.0, float %div1, i32 1
|
|
%insert.2 = insertelement <3 x float> %insert.1, float %div2, i32 2
|
|
ret <3 x float> %insert.2
|
|
}
|
|
|
|
; Four arcp f32 divisions that share the divisor %D.
; The expected output for every target computes one reciprocal of %D (a single
; v_rcp_f32 with v_div_fixup_f32 against the constant 1.0) and forms the four
; results with four v_mul_f32.
; Attribute group #0 is defined outside this excerpt.
define <4 x float> @v_repeat_divisor_f32_x4_arcp(float %x, float %y, float %z, float %w, float %D) #0 {
|
|
; GFX6-LABEL: v_repeat_divisor_f32_x4_arcp:
|
|
; GFX6: ; %bb.0:
|
|
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX6-NEXT: v_div_scale_f32 v5, s[4:5], v4, v4, 1.0
|
|
; GFX6-NEXT: v_rcp_f32_e32 v6, v5
|
|
; GFX6-NEXT: v_fma_f32 v7, -v5, v6, 1.0
|
|
; GFX6-NEXT: v_fma_f32 v6, v7, v6, v6
|
|
; GFX6-NEXT: v_div_scale_f32 v7, vcc, 1.0, v4, 1.0
|
|
; GFX6-NEXT: v_mul_f32_e32 v8, v7, v6
|
|
; GFX6-NEXT: v_fma_f32 v9, -v5, v8, v7
|
|
; GFX6-NEXT: v_fma_f32 v8, v9, v6, v8
|
|
; GFX6-NEXT: v_fma_f32 v5, -v5, v8, v7
|
|
; GFX6-NEXT: v_div_fmas_f32 v5, v5, v6, v8
|
|
; GFX6-NEXT: v_div_fixup_f32 v4, v5, v4, 1.0
|
|
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v4
|
|
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v4
|
|
; GFX6-NEXT: v_mul_f32_e32 v2, v2, v4
|
|
; GFX6-NEXT: v_mul_f32_e32 v3, v3, v4
|
|
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_repeat_divisor_f32_x4_arcp:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_div_scale_f32 v5, s[4:5], v4, v4, 1.0
|
|
; GFX9-NEXT: v_div_scale_f32 v6, vcc, 1.0, v4, 1.0
|
|
; GFX9-NEXT: v_rcp_f32_e32 v7, v5
|
|
; GFX9-NEXT: v_fma_f32 v8, -v5, v7, 1.0
|
|
; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7
|
|
; GFX9-NEXT: v_mul_f32_e32 v8, v6, v7
|
|
; GFX9-NEXT: v_fma_f32 v9, -v5, v8, v6
|
|
; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8
|
|
; GFX9-NEXT: v_fma_f32 v5, -v5, v8, v6
|
|
; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8
|
|
; GFX9-NEXT: v_div_fixup_f32 v4, v5, v4, 1.0
|
|
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
|
|
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v4
|
|
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v4
|
|
; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_repeat_divisor_f32_x4_arcp:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: v_div_scale_f32 v5, null, v4, v4, 1.0
|
|
; GFX11-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v4, 1.0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_rcp_f32_e32 v6, v5
|
|
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
|
; GFX11-NEXT: v_fma_f32 v7, -v5, v6, 1.0
|
|
; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v6
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_mul_f32_e32 v7, v8, v6
|
|
; GFX11-NEXT: v_fma_f32 v9, -v5, v7, v8
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v6
|
|
; GFX11-NEXT: v_fma_f32 v5, -v5, v7, v8
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_div_fmas_f32 v5, v5, v6, v7
|
|
; GFX11-NEXT: v_div_fixup_f32 v4, v5, v4, 1.0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
|
|
; GFX11-NEXT: v_mul_f32_e32 v1, v1, v4
|
|
; GFX11-NEXT: v_mul_f32_e32 v2, v2, v4
|
|
; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
%div0 = fdiv arcp float %x, %D
|
|
%div1 = fdiv arcp float %y, %D
|
|
%div2 = fdiv arcp float %z, %D
|
|
%div3 = fdiv arcp float %w, %D
|
|
%insert.0 = insertelement <4 x float> poison, float %div0, i32 0
|
|
%insert.1 = insertelement <4 x float> %insert.0, float %div1, i32 1
|
|
%insert.2 = insertelement <4 x float> %insert.1, float %div2, i32 2
|
|
%insert.3 = insertelement <4 x float> %insert.2, float %div3, i32 3
|
|
ret <4 x float> %insert.3
|
|
}
|
|
|
|
; Three arcp half divisions that share the divisor %D.
; Expected output per target:
;  - tahiti: the operands are converted to f32, 1.0 / %D goes through the f32
;    division expansion, the reciprocal is converted to f16 and back to f32,
;    and three v_mul_f32 products are converted back to f16; the first two are
;    packed into v0 with shift + or and the third is left in v1.
;  - gfx900: a single v_rcp_f16 feeds three v_mul_f16; the first two results
;    are packed with v_pack_b32_f16 and the third is left in v1.
;  - gfx1100: a single v_rcp_f16 feeds three v_mul_f16; the +real-true16 run
;    writes v0.l / v0.h / v1.l directly, while the -real-true16 run uses full
;    registers plus v_pack_b32_f16.
; Attribute group #0 is defined outside this excerpt.
define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half %D) #0 {
|
|
; GFX6-LABEL: v_repeat_divisor_f16_x3_arcp:
|
|
; GFX6: ; %bb.0:
|
|
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
|
|
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
|
|
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; GFX6-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, 1.0
|
|
; GFX6-NEXT: v_rcp_f32_e32 v5, v4
|
|
; GFX6-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
|
|
; GFX6-NEXT: v_fma_f32 v7, -v4, v5, 1.0
|
|
; GFX6-NEXT: v_fma_f32 v5, v7, v5, v5
|
|
; GFX6-NEXT: v_mul_f32_e32 v7, v6, v5
|
|
; GFX6-NEXT: v_fma_f32 v8, -v4, v7, v6
|
|
; GFX6-NEXT: v_fma_f32 v7, v8, v5, v7
|
|
; GFX6-NEXT: v_fma_f32 v4, -v4, v7, v6
|
|
; GFX6-NEXT: v_div_fmas_f32 v4, v4, v5, v7
|
|
; GFX6-NEXT: v_div_fixup_f32 v3, v4, v3, 1.0
|
|
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
|
|
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
|
|
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
|
|
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3
|
|
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v1
|
|
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; GFX6-NEXT: v_mul_f32_e32 v1, v2, v3
|
|
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
|
|
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4
|
|
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
|
|
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_repeat_divisor_f16_x3_arcp:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_rcp_f16_e32 v3, v3
|
|
; GFX9-NEXT: v_mul_f16_e32 v0, v0, v3
|
|
; GFX9-NEXT: v_mul_f16_e32 v4, v1, v3
|
|
; GFX9-NEXT: v_mul_f16_e32 v1, v2, v3
|
|
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v4
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-TRUE16-LABEL: v_repeat_divisor_f16_x3_arcp:
|
|
; GFX11-TRUE16: ; %bb.0:
|
|
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v3.l, v3.l
|
|
; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
|
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v3.l
|
|
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.h, v1.l, v3.l
|
|
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v1.l, v2.l, v3.l
|
|
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-FAKE16-LABEL: v_repeat_divisor_f16_x3_arcp:
|
|
; GFX11-FAKE16: ; %bb.0:
|
|
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v3, v3
|
|
; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
|
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v3
|
|
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v3
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
|
|
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, v2, v3
|
|
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
|
%div0 = fdiv arcp half %x, %D
|
|
%div1 = fdiv arcp half %y, %D
|
|
%div2 = fdiv arcp half %z, %D
|
|
%insert.0 = insertelement <3 x half> poison, half %div0, i32 0
|
|
%insert.1 = insertelement <3 x half> %insert.0, half %div1, i32 1
|
|
%insert.2 = insertelement <3 x half> %insert.1, half %div2, i32 2
|
|
ret <3 x half> %insert.2
|
|
}
|
|
|
|
; Two <2 x float> fdivs with the arcp flag share the divisor %D; the two
; quotients are concatenated into a <4 x float> by the shufflevector.
; No !fpmath metadata is attached, and the function uses attribute group #0.
; In the expected code for every run line, 1.0 / %D is expanded once per
; divisor lane (results in v4 and v5) and the four numerator lanes are then
; multiplied by those two values.
define <4 x float> @v_repeat_divisor_v2f32_x2(<2 x float> %x, <2 x float> %y, <2 x float> %D) #0 {
; GFX6-LABEL: v_repeat_divisor_v2f32_x2:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v7, v6
; GFX6-NEXT: v_fma_f32 v8, -v6, v7, 1.0
; GFX6-NEXT: v_fma_f32 v7, v8, v7, v7
; GFX6-NEXT: v_div_scale_f32 v8, vcc, 1.0, v4, 1.0
; GFX6-NEXT: v_mul_f32_e32 v9, v8, v7
; GFX6-NEXT: v_fma_f32 v10, -v6, v9, v8
; GFX6-NEXT: v_fma_f32 v9, v10, v7, v9
; GFX6-NEXT: v_fma_f32 v6, -v6, v9, v8
; GFX6-NEXT: v_div_scale_f32 v8, s[4:5], v5, v5, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v10, v8
; GFX6-NEXT: v_div_fmas_f32 v6, v6, v7, v9
; GFX6-NEXT: v_div_fixup_f32 v4, v6, v4, 1.0
; GFX6-NEXT: v_div_scale_f32 v7, vcc, 1.0, v5, 1.0
; GFX6-NEXT: v_fma_f32 v6, -v8, v10, 1.0
; GFX6-NEXT: v_fma_f32 v6, v6, v10, v10
; GFX6-NEXT: v_mul_f32_e32 v9, v7, v6
; GFX6-NEXT: v_fma_f32 v10, -v8, v9, v7
; GFX6-NEXT: v_fma_f32 v9, v10, v6, v9
; GFX6-NEXT: v_fma_f32 v7, -v8, v9, v7
; GFX6-NEXT: v_div_fmas_f32 v6, v7, v6, v9
; GFX6-NEXT: v_div_fixup_f32 v5, v6, v5, 1.0
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX6-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX6-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_repeat_divisor_v2f32_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, 1.0
; GFX9-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, 1.0
; GFX9-NEXT: v_div_scale_f32 v8, vcc, 1.0, v4, 1.0
; GFX9-NEXT: v_div_scale_f32 v9, s[4:5], 1.0, v5, 1.0
; GFX9-NEXT: v_rcp_f32_e32 v10, v6
; GFX9-NEXT: v_rcp_f32_e32 v11, v7
; GFX9-NEXT: v_fma_f32 v12, -v6, v10, 1.0
; GFX9-NEXT: v_fma_f32 v10, v12, v10, v10
; GFX9-NEXT: v_fma_f32 v13, -v7, v11, 1.0
; GFX9-NEXT: v_fma_f32 v11, v13, v11, v11
; GFX9-NEXT: v_mul_f32_e32 v12, v8, v10
; GFX9-NEXT: v_mul_f32_e32 v13, v9, v11
; GFX9-NEXT: v_fma_f32 v14, -v6, v12, v8
; GFX9-NEXT: v_fma_f32 v15, -v7, v13, v9
; GFX9-NEXT: v_fma_f32 v12, v14, v10, v12
; GFX9-NEXT: v_fma_f32 v6, -v6, v12, v8
; GFX9-NEXT: v_fma_f32 v8, v15, v11, v13
; GFX9-NEXT: v_div_fmas_f32 v6, v6, v10, v12
; GFX9-NEXT: v_fma_f32 v7, -v7, v8, v9
; GFX9-NEXT: s_mov_b64 vcc, s[4:5]
; GFX9-NEXT: v_div_fmas_f32 v7, v7, v11, v8
; GFX9-NEXT: v_div_fixup_f32 v4, v6, v4, 1.0
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX9-NEXT: v_div_fixup_f32 v5, v7, v5, 1.0
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_repeat_divisor_v2f32_x2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v6, null, v4, v4, 1.0
; GFX11-NEXT: v_div_scale_f32 v7, null, v5, v5, 1.0
; GFX11-NEXT: v_div_scale_f32 v12, vcc_lo, 1.0, v4, 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_rcp_f32_e32 v8, v6
; GFX11-NEXT: v_rcp_f32_e32 v9, v7
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f32 v10, -v6, v8, 1.0
; GFX11-NEXT: v_fma_f32 v11, -v7, v9, 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_fmac_f32 v8, v10, v8 :: v_dual_fmac_f32 v9, v11, v9
; GFX11-NEXT: v_div_scale_f32 v10, s0, 1.0, v5, 1.0
; GFX11-NEXT: v_mul_f32_e32 v11, v12, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f32 v14, -v6, v11, v12
; GFX11-NEXT: v_fmac_f32_e32 v11, v14, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f32 v6, -v6, v11, v12
; GFX11-NEXT: v_div_fmas_f32 v6, v6, v8, v11
; GFX11-NEXT: s_mov_b32 vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_div_fixup_f32 v4, v6, v4, 1.0
; GFX11-NEXT: v_dual_mul_f32 v13, v10, v9 :: v_dual_mul_f32 v0, v0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f32 v15, -v7, v13, v10
; GFX11-NEXT: v_dual_mul_f32 v2, v2, v4 :: v_dual_fmac_f32 v13, v15, v9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f32 v7, -v7, v13, v10
; GFX11-NEXT: v_div_fmas_f32 v7, v7, v9, v13
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_div_fixup_f32 v5, v7, v5, 1.0
; GFX11-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX11-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %div0 = fdiv arcp <2 x float> %x, %D
  %div1 = fdiv arcp <2 x float> %y, %D
  %shuffle = shufflevector <2 x float> %div0, <2 x float> %div1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuffle
}
|
|
|
|
; Scalar f32 variant: two arcp fdivs by the same %D, each tagged with
; !fpmath !0, packed into a <2 x float>. Uses attribute group #0.
; The expected code computes one value from %D (frexp_mant, rcp, frexp_exp,
; negate, ldexp) and multiplies both numerators by it. The tahiti run line
; additionally selects between %D and its mantissa with a compare against
; 0x7f800000 before the rcp.
define <2 x float> @v_repeat_divisor_f32_x2_ulp25(float %x, float %y, float %D) #0 {
; GFX6-LABEL: v_repeat_divisor_f32_x2_ulp25:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, 0x7f800000
; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v2
; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s4
; GFX6-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v2, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v2
; GFX6-NEXT: v_ldexp_f32_e32 v2, v3, v2
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_repeat_divisor_f32_x2_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v2
; GFX9-NEXT: v_rcp_f32_e32 v3, v3
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v2, v2
; GFX9-NEXT: v_sub_u32_e32 v2, 0, v2
; GFX9-NEXT: v_ldexp_f32 v2, v3, v2
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_repeat_divisor_f32_x2_ulp25:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v2
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v2
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_ldexp_f32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %div0 = fdiv arcp float %x, %D, !fpmath !0
  %div1 = fdiv arcp float %y, %D, !fpmath !0
  %insert.0 = insertelement <2 x float> poison, float %div0, i32 0
  %insert.1 = insertelement <2 x float> %insert.0, float %div1, i32 1
  ret <2 x float> %insert.1
}
|
|
|
|
; Same IR body as v_repeat_divisor_f32_x2_ulp25 (two arcp fdivs by %D with
; !fpmath !0), but the function uses attribute group #1 instead of #0.
; The expected code for every run line is a single v_rcp_f32 of %D followed
; by one multiply per numerator.
define <2 x float> @v_repeat_divisor_f32_x2_daz_ulp25(float %x, float %y, float %D) #1 {
; GFX6-LABEL: v_repeat_divisor_f32_x2_daz_ulp25:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_rcp_f32_e32 v2, v2
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_repeat_divisor_f32_x2_daz_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_repeat_divisor_f32_x2_daz_ulp25:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %div0 = fdiv arcp float %x, %D, !fpmath !0
  %div1 = fdiv arcp float %y, %D, !fpmath !0
  %insert.0 = insertelement <2 x float> poison, float %div0, i32 0
  %insert.1 = insertelement <2 x float> %insert.0, float %div1, i32 1
  ret <2 x float> %insert.1
}
|
|
|
|
; <2 x half> variant: two arcp fdivs by the same %D, concatenated into a
; <4 x half>. No !fpmath metadata; uses attribute group #0.
; On the tahiti run line the expected code converts the halves to f32 and
; uses the div_scale/fma/div_fmas/div_fixup sequence on 1.0 / %D per lane,
; then multiplies in f32 and converts back. On the other run lines the
; expected code is v_rcp_f16 per divisor lane followed by two v_pk_mul_f16.
define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x half> %D) #0 {
; GFX6-LABEL: v_repeat_divisor_v2f16_x2:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX6-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v5, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-NEXT: v_div_scale_f32 v7, vcc, 1.0, v3, 1.0
; GFX6-NEXT: v_mul_f32_e32 v8, v7, v5
; GFX6-NEXT: v_fma_f32 v9, -v4, v8, v7
; GFX6-NEXT: v_fma_f32 v8, v9, v5, v8
; GFX6-NEXT: v_fma_f32 v4, -v4, v8, v7
; GFX6-NEXT: v_div_scale_f32 v7, s[4:5], v2, v2, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v9, v7
; GFX6-NEXT: v_div_fmas_f32 v4, v4, v5, v8
; GFX6-NEXT: v_div_fixup_f32 v3, v4, v3, 1.0
; GFX6-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
; GFX6-NEXT: v_fma_f32 v4, -v7, v9, 1.0
; GFX6-NEXT: v_fma_f32 v4, v4, v9, v9
; GFX6-NEXT: v_mul_f32_e32 v8, v5, v4
; GFX6-NEXT: v_fma_f32 v9, -v7, v8, v5
; GFX6-NEXT: v_fma_f32 v8, v9, v4, v8
; GFX6-NEXT: v_fma_f32 v5, -v7, v8, v5
; GFX6-NEXT: v_div_fmas_f32 v4, v5, v4, v8
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_div_fixup_f32 v2, v4, v2, 1.0
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v5, v6, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_repeat_divisor_v2f16_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_sdwa v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_rcp_f16_e32 v2, v2
; GFX9-NEXT: v_pack_b32_f16 v2, v2, v3
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_repeat_divisor_v2f16_x2:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v2.h, v2.h
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v2.l, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-TRUE16-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX11-TRUE16-NEXT: v_pk_mul_f16 v1, v1, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_repeat_divisor_v2f16_x2:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v2, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v3, v3
; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v2, v3
; GFX11-FAKE16-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX11-FAKE16-NEXT: v_pk_mul_f16 v1, v1, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %div0 = fdiv arcp <2 x half> %x, %D
  %div1 = fdiv arcp <2 x half> %y, %D
  %shuffle = shufflevector <2 x half> %div0, <2 x half> %div1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x half> %shuffle
}
|
|
|
|
; <3 x half> variant: two arcp fdivs by the same %D, concatenated into a
; <6 x half> by a six-lane shufflevector. No !fpmath metadata; uses
; attribute group #0.
; On the tahiti run line the expected code converts to f32 and expands
; 1.0 / %D with div_scale/fma/div_fmas/div_fixup for each of the three
; divisor lanes. On the other run lines the expected code takes v_rcp_f16 of
; each divisor lane, multiplies with v_pk_mul_f16, and repacks the result
; registers.
define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x half> %D) #0 {
; GFX6-LABEL: v_repeat_divisor_v3f16_x2:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_div_scale_f32 v7, s[4:5], v6, v6, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v8, v7
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_fma_f32 v9, -v7, v8, 1.0
; GFX6-NEXT: v_fma_f32 v8, v9, v8, v8
; GFX6-NEXT: v_div_scale_f32 v9, vcc, 1.0, v6, 1.0
; GFX6-NEXT: v_mul_f32_e32 v10, v9, v8
; GFX6-NEXT: v_fma_f32 v11, -v7, v10, v9
; GFX6-NEXT: v_fma_f32 v10, v11, v8, v10
; GFX6-NEXT: v_fma_f32 v7, -v7, v10, v9
; GFX6-NEXT: v_div_fmas_f32 v7, v7, v8, v10
; GFX6-NEXT: v_div_scale_f32 v8, s[4:5], v4, v4, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v9, v8
; GFX6-NEXT: v_div_fixup_f32 v6, v7, v6, 1.0
; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX6-NEXT: v_fma_f32 v10, -v8, v9, 1.0
; GFX6-NEXT: v_fma_f32 v9, v10, v9, v9
; GFX6-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
; GFX6-NEXT: v_mul_f32_e32 v11, v10, v9
; GFX6-NEXT: v_fma_f32 v12, -v8, v11, v10
; GFX6-NEXT: v_fma_f32 v11, v12, v9, v11
; GFX6-NEXT: v_fma_f32 v8, -v8, v11, v10
; GFX6-NEXT: v_div_scale_f32 v10, s[4:5], v5, v5, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v12, v10
; GFX6-NEXT: v_div_fmas_f32 v8, v8, v9, v11
; GFX6-NEXT: v_div_fixup_f32 v4, v8, v4, 1.0
; GFX6-NEXT: v_div_scale_f32 v9, vcc, 1.0, v5, 1.0
; GFX6-NEXT: v_fma_f32 v8, -v10, v12, 1.0
; GFX6-NEXT: v_fma_f32 v8, v8, v12, v12
; GFX6-NEXT: v_mul_f32_e32 v11, v9, v8
; GFX6-NEXT: v_fma_f32 v12, -v10, v11, v9
; GFX6-NEXT: v_fma_f32 v11, v12, v8, v11
; GFX6-NEXT: v_fma_f32 v9, -v10, v11, v9
; GFX6-NEXT: v_div_fmas_f32 v8, v9, v8, v11
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX6-NEXT: v_div_fixup_f32 v5, v8, v5, 1.0
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7
; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8
; GFX6-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX6-NEXT: v_mul_f32_e32 v7, v7, v6
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v6, v8, v6
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v7
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_repeat_divisor_v3f16_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_sdwa v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_rcp_f16_e32 v4, v4
; GFX9-NEXT: v_rcp_f16_e32 v5, v5
; GFX9-NEXT: s_movk_i32 s4, 0x7e00
; GFX9-NEXT: v_pack_b32_f16 v4, v4, v6
; GFX9-NEXT: v_pack_b32_f16 v5, v5, s4
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5
; GFX9-NEXT: v_pk_mul_f16 v3, v3, v5
; GFX9-NEXT: v_pk_mul_f16 v4, v2, v4
; GFX9-NEXT: v_alignbit_b32 v2, v3, v4, 16
; GFX9-NEXT: v_pack_b32_f16 v1, v1, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_repeat_divisor_v3f16_x2:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v4.h, v4.h
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v4.l, v4.l
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v5.l, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0x7e00
; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-TRUE16-NEXT: v_pk_mul_f16 v2, v2, v4
; GFX11-TRUE16-NEXT: v_pk_mul_f16 v3, v3, v5
; GFX11-TRUE16-NEXT: v_pk_mul_f16 v1, v1, v5
; GFX11-TRUE16-NEXT: v_pk_mul_f16 v0, v0, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_repeat_divisor_v3f16_x2:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v4, v4
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v5, v5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v6, v6
; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v5, v5, 0x7e00
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v4, v4, v6
; GFX11-FAKE16-NEXT: v_pk_mul_f16 v1, v1, v5
; GFX11-FAKE16-NEXT: v_pk_mul_f16 v3, v3, v5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_pk_mul_f16 v2, v2, v4
; GFX11-FAKE16-NEXT: v_pk_mul_f16 v0, v0, v4
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, v3, v2, 16
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %div0 = fdiv arcp <3 x half> %x, %D
  %div1 = fdiv arcp <3 x half> %y, %D
  %shuffle = shufflevector <3 x half> %div0, <3 x half> %div1, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
  ret <6 x half> %shuffle
}
|
|
|
|
; Attribute group #0 sets the float component of denormal_fpenv to ieee|ieee.
; It is used by every test in this part of the file except the *_daz_* one.
attributes #0 = { denormal_fpenv(float: ieee|ieee) }
; Attribute group #1 sets the float component of denormal_fpenv to
; preservesign; it is used by v_repeat_divisor_f32_x2_daz_ulp25.
attributes #1 = { denormal_fpenv(float: preservesign) }

; Operand of the !fpmath attachments on the fdivs in the *_ulp25 tests.
!0 = !{float 2.5}
|
|
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
|
; GCN: {{.*}}
|