[AMDGPU] Fix and simplify patterns selecting fsub to v_fma_mix_f32 (#180169)

Select (fsub x, y) -> (fma y, -1.0, x). Using -1.0 as the constant
avoids the need for ComplexPatterns to negate x or y.

This also fixes the bad pattern (fsub x, y) -> (fma -x, 1.0, y), which
computed y - x instead of x - y.
This commit is contained in:
Jay Foad 2026-02-06 14:39:13 +00:00 committed by GitHub
parent 5283f46615
commit 4a6697f393
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 96 additions and 135 deletions

View File

@ -205,10 +205,6 @@ def gi_vop3_mad_mix_mods_ext :
GIComplexOperandMatcher<s64, "selectVOP3PMadMixModsExt">,
GIComplexPatternEquiv<VOP3PMadMixModsExt>;
// GlobalISel equivalent of the VOP3PMadMixModsNeg ComplexPattern; the operand
// is matched by the selector's selectVOP3PMadMixModsNeg member.
def gi_vop3_mad_mix_mods_neg :
GIComplexOperandMatcher<s64, "selectVOP3PMadMixModsNeg">,
GIComplexPatternEquiv<VOP3PMadMixModsNeg>;
// Separate load nodes are defined to glue m0 initialization in
// SelectionDAG. The GISel selector can just insert m0 initialization
// directly before selecting a glue-less load, so hide this

View File

@ -4204,24 +4204,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
return true;
}
/// Select the source operand and modifiers for a mad-mix input using the f16
/// implementation helper, then flip the NEG source modifier before handing the
/// modifier bits back as an i32 target constant. Always reports a match.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsNeg(SDValue In, SDValue &Src,
                                                  SDValue &SrcMods) const {
  unsigned ModBits = 0;
  // The helper's result is intentionally ignored: Src and ModBits are used
  // whatever it reports.
  SelectVOP3PMadMixModsImpl(In, Src, ModBits, MVT::f16);

  // XOR (rather than OR) so that an already-set NEG bit is cleared.
  ModBits ^= SISrcMods::NEG;

  const SDLoc DL(In);
  SrcMods = CurDAG->getTargetConstant(ModBits, DL, MVT::i32);
  return true;
}
/// bf16 counterpart of SelectVOP3PMadMixModsNeg: select the source operand and
/// modifiers using the bf16 implementation helper, flip the NEG source
/// modifier, and return the modifier bits as an i32 target constant. Always
/// reports a match.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsNeg(SDValue In, SDValue &Src,
                                                      SDValue &SrcMods) const {
  unsigned ModBits = 0;
  // The helper's result is intentionally ignored: Src and ModBits are used
  // whatever it reports.
  SelectVOP3PMadMixModsImpl(In, Src, ModBits, MVT::bf16);

  // XOR (rather than OR) so that an already-set NEG bit is cleared.
  ModBits ^= SISrcMods::NEG;

  const SDLoc DL(In);
  SrcMods = CurDAG->getTargetConstant(ModBits, DL, MVT::i32);
  return true;
}
// Match BITOP3 operation and return a number of matched instructions plus
// truth table.
static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,

View File

@ -260,11 +260,6 @@ private:
bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
SDValue &SrcMods) const;
bool SelectVOP3PMadMixModsNeg(SDValue In, SDValue &Src,
SDValue &SrcMods) const;
bool SelectVOP3PMadMixBF16ModsNeg(SDValue In, SDValue &Src,
SDValue &SrcMods) const;
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;

View File

@ -6930,21 +6930,6 @@ AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
}};
}
/// GlobalISel renderer for a mad-mix source operand whose NEG modifier is
/// flipped relative to what selectVOP3PMadMixModsImpl computes. Produces two
/// renderers, in order: the source register, then the source-modifier
/// immediate.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMadMixModsNeg(
    MachineOperand &Root) const {
  // The helper requires the Matched out-parameter, but its value is not
  // consulted here.
  bool UnusedMatched;
  Register SrcReg;
  unsigned ModBits;
  std::tie(SrcReg, ModBits) = selectVOP3PMadMixModsImpl(Root, UnusedMatched);

  // XOR (rather than OR) so that an already-set NEG bit is cleared.
  ModBits ^= SISrcMods::NEG;

  return {{
      [SrcReg](MachineInstrBuilder &MIB) { MIB.addReg(SrcReg); },
      [ModBits](MachineInstrBuilder &MIB) { MIB.addImm(ModBits); } // src_mods
  }};
}
bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
MachineInstr &I, Intrinsic::ID IntrID) const {
MachineBasicBlock *MBB = I.getParent();

View File

@ -343,7 +343,6 @@ private:
bool &Matched) const;
ComplexRendererFns selectVOP3PMadMixModsExt(MachineOperand &Root) const;
ComplexRendererFns selectVOP3PMadMixMods(MachineOperand &Root) const;
ComplexRendererFns selectVOP3PMadMixModsNeg(MachineOperand &Root) const;
void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx = -1) const;

View File

@ -744,6 +744,7 @@ int FP32_NEG_ONE = 0xbf800000;
int FP64_ONE = 0x3ff0000000000000;
int FP64_NEG_ONE = 0xbff0000000000000;
int BF16_ONE = 0x3F80;
int BF16_NEG_ONE = 0xBF80;
}
def CONST : Constants;

View File

@ -1712,8 +1712,6 @@ def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">;
def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">;
// Variants whose selectors additionally flip the NEG source modifier
// (f16 and bf16 forms respectively).
def VOP3PMadMixModsNeg : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsNeg">;
def VOP3PMadMixBF16ModsNeg : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsNeg">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;

View File

@ -182,8 +182,8 @@ multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like,
ValueType VT = f16> {
defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
defvar VOP3PMadMixModsNegPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsNeg, VOP3PMadMixModsNeg);
defvar OneImm = !if (!eq(VT, bf16), CONST.BF16_ONE, CONST.FP16_ONE);
defvar NegOneImm = !if (!eq(VT, bf16), CONST.BF16_NEG_ONE, CONST.FP16_NEG_ONE);
// At least one of the operands needs to be an fpextend of an f16
// for this to be worthwhile, so we need three patterns here.
// TODO: Could we use a predicate to inspect src1/2/3 instead?
@ -206,28 +206,33 @@ multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like,
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
// (fadd x, y) -> (fma x, 1.0, y)
def : GCNPat <
(f32 (fadd (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
(f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
(mix_inst $src0_mods, $src0, (i32 8), (i32 OneImm), $src1_mods, $src1,
DSTCLAMP.NONE)>;
// (fmul x, y) -> (fma x, y, 0.0)
// FIXME: This is only valid with nsz.
def : GCNPat <
(f32 (fmul (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
(f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, (i32 0), (i32 0),
DSTCLAMP.NONE)>;
// (fsub x, y) -> (fma y, -1.0, x)
def : GCNPat <
(f32 (fsub (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
(f32 (VOP3PMadMixModsNegPat f32:$src1, i32:$src1_mods)))),
(mix_inst $src0_mods, $src0, (i32 8), (i32 OneImm), $src1_mods, $src1,
(f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
(mix_inst $src1_mods, $src1, (i32 8), (i32 NegOneImm), $src0_mods, $src0,
DSTCLAMP.NONE)>;
// (fsub x, y) -> (fma y, -1.0, x)
def : GCNPat <
(f32 (fsub (f32 (VOP3PMadMixModsNegPat f32:$src0, i32:$src0_mods)),
(f32 (fsub (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_mods)),
(f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)))),
(mix_inst $src0_mods, $src0, (i32 8), (i32 OneImm), $src1_mods, $src1,
(mix_inst $src1_mods, $src1, (i32 8), (i32 NegOneImm), $src0_mods, $src0,
DSTCLAMP.NONE)>;
}

View File

@ -44,10 +44,10 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_mul(<4 x half> %x, <4
; GFX9-DENORM: ; %bb.0: ; %entry
; GFX9-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2
; GFX9-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, 1.0, -v4 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v2, 1.0, -v5 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, 1.0, -v6 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v3, 1.0, -v7 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v4, -1.0, v2 op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v5, -1.0, v2 op_sel:[0,0,1] op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v6, -1.0, v3 op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v7, -1.0, v3 op_sel:[0,0,1] op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_mul:
@ -72,10 +72,10 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_mul_rhs(<4 x float> %x
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: v_pk_mul_f16 v4, v4, v6
; GFX9-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, -v0, 1.0, v4 op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, -v1, 1.0, v4 op_sel:[0,0,1] op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, -v2, 1.0, v5 op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, -v3, 1.0, v5 op_sel:[0,0,1] op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v4, -1.0, v1 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v5, -1.0, v2 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v5, -1.0, v3 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_mul_rhs:

View File

@ -85,10 +85,10 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_neg_mul(<4 x half> %x,
; GFX9-DENORM: ; %bb.0: ; %entry
; GFX9-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, 1.0, -v4 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v2, 1.0, -v5 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, 1.0, -v6 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v3, 1.0, -v7 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v4, -1.0, v2 op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v5, -1.0, v2 op_sel:[0,0,1] op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v6, -1.0, v3 op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v7, -1.0, v3 op_sel:[0,0,1] op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul:
@ -115,10 +115,10 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_neg_ext_mul(<4 x half> %x,
; GFX9-DENORM: ; %bb.0: ; %entry
; GFX9-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, 1.0, -v4 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v2, 1.0, -v5 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, 1.0, -v6 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v3, 1.0, -v7 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v4, -1.0, v2 op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v5, -1.0, v2 op_sel:[0,0,1] op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v6, -1.0, v3 op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v7, -1.0, v3 op_sel:[0,0,1] op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul:
@ -146,10 +146,10 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_neg_mul2(<4 x float> %
; GFX9-DENORM: ; %bb.0: ; %entry
; GFX9-DENORM-NEXT: v_pk_mul_f16 v4, v4, v6 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, -v0, 1.0, v4 op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, -v1, 1.0, v4 op_sel:[0,0,1] op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, -v2, 1.0, v5 op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, -v3, 1.0, v5 op_sel:[0,0,1] op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v4, -1.0, v1 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v5, -1.0, v2 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v5, -1.0, v3 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul2:
@ -175,10 +175,10 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_neg_ext_mul2(<4 x float> %
; GFX9-DENORM: ; %bb.0: ; %entry
; GFX9-DENORM-NEXT: v_pk_mul_f16 v4, v4, v6 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, -v0, 1.0, v4 op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, -v1, 1.0, v4 op_sel:[0,0,1] op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, -v2, 1.0, v5 op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, -v3, 1.0, v5 op_sel:[0,0,1] op_sel_hi:[0,1,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v4, -1.0, v1 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v5, -1.0, v2 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v5, -1.0, v3 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul2:

View File

@ -13969,7 +13969,7 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1.0, -v1 op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, -1.0, v0 op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@ -14372,9 +14372,9 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX1250TRUE16: ; %bb.0:
; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250TRUE16-NEXT: v_fma_mix_f32_bf16 v1, v1, 1.0, -v3 op_sel_hi:[1,1,1]
; GFX1250TRUE16-NEXT: v_fma_mix_f32_bf16 v3, v0, 1.0, -v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1250TRUE16-NEXT: v_fma_mix_f32_bf16 v0, v0, 1.0, -v2 op_sel_hi:[1,1,1]
; GFX1250TRUE16-NEXT: v_fma_mix_f32_bf16 v1, v3, -1.0, v1 op_sel_hi:[1,1,1]
; GFX1250TRUE16-NEXT: v_fma_mix_f32_bf16 v3, v2, -1.0, v0 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1250TRUE16-NEXT: v_fma_mix_f32_bf16 v0, v2, -1.0, v0 op_sel_hi:[1,1,1]
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250TRUE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
; GFX1250TRUE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
@ -14384,9 +14384,9 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX1250FAKE16: ; %bb.0:
; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250FAKE16-NEXT: v_fma_mix_f32_bf16 v4, v0, 1.0, -v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1250FAKE16-NEXT: v_fma_mix_f32_bf16 v0, v0, 1.0, -v2 op_sel_hi:[1,1,1]
; GFX1250FAKE16-NEXT: v_fma_mix_f32_bf16 v1, v1, 1.0, -v3 op_sel_hi:[1,1,1]
; GFX1250FAKE16-NEXT: v_fma_mix_f32_bf16 v4, v2, -1.0, v0 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1250FAKE16-NEXT: v_fma_mix_f32_bf16 v0, v2, -1.0, v0 op_sel_hi:[1,1,1]
; GFX1250FAKE16-NEXT: v_fma_mix_f32_bf16 v1, v3, -1.0, v1 op_sel_hi:[1,1,1]
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
; GFX1250FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
@ -14670,10 +14670,10 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mix_f32_bf16 v4, v1, 1.0, -v3 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v5, v0, 1.0, -v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1.0, -v2 op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v1, v1, 1.0, -v3 op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v4, v3, -1.0, v1 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v5, v2, -1.0, v0 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v2, -1.0, v0 op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v1, v3, -1.0, v1 op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4
@ -31855,7 +31855,7 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX1250TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250TRUE16-NEXT: v_trunc_f32_e32 v2, v1
; GFX1250TRUE16-NEXT: v_fma_mix_f32_bf16 v0, v0, 1.0, -v2 op_sel_hi:[1,1,0]
; GFX1250TRUE16-NEXT: v_fma_mix_f32_bf16 v0, v2, -1.0, v0 op_sel_hi:[0,1,1]
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250TRUE16-NEXT: v_cmp_ge_f32_e64 s0, |v0|, 0.5
; GFX1250TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
@ -31873,7 +31873,7 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX1250FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250FAKE16-NEXT: v_trunc_f32_e32 v2, v1
; GFX1250FAKE16-NEXT: v_fma_mix_f32_bf16 v0, v0, 1.0, -v2 op_sel_hi:[1,1,0]
; GFX1250FAKE16-NEXT: v_fma_mix_f32_bf16 v0, v2, -1.0, v0 op_sel_hi:[0,1,1]
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250FAKE16-NEXT: v_cmp_ge_f32_e64 s0, |v0|, 0.5
; GFX1250FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0

View File

@ -624,7 +624,7 @@ define float @fsub_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v0, 1.0, -v2 op_sel_hi:[1,1,0]
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v2, -1.0, v0 op_sel_hi:[0,1,1]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: fsub_fpext_fmul_f16_to_f32:
@ -632,7 +632,7 @@ define float @fsub_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v0, 1.0, -v2 op_sel_hi:[1,1,0]
; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v2, -1.0, v0 op_sel_hi:[0,1,1]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32:
@ -677,7 +677,7 @@ define float @fsub_fpext_fmul_f16_to_f32_commute(float %x, half %y, half %z) #0
; GFX11-F32DENORM-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-F32DENORM-TRUE16-NEXT: v_mul_f16_e32 v1.l, v1.l, v2.l
; GFX11-F32DENORM-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-F32DENORM-TRUE16-NEXT: v_fma_mix_f32 v0, -v0, 1.0, v1 op_sel_hi:[0,1,1]
; GFX11-F32DENORM-TRUE16-NEXT: v_fma_mix_f32 v0, v1, -1.0, v0 op_sel_hi:[1,1,0]
; GFX11-F32DENORM-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-F32DENORM-FAKE16-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
@ -685,7 +685,7 @@ define float @fsub_fpext_fmul_f16_to_f32_commute(float %x, half %y, half %z) #0
; GFX11-F32DENORM-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-F32DENORM-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-F32DENORM-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-F32DENORM-FAKE16-NEXT: v_fma_mix_f32 v0, -v0, 1.0, v1 op_sel_hi:[0,1,1]
; GFX11-F32DENORM-FAKE16-NEXT: v_fma_mix_f32 v0, v1, -1.0, v0 op_sel_hi:[1,1,0]
; GFX11-F32DENORM-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
@ -724,7 +724,7 @@ define float @fsub_fpext_fneg_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v0, 1.0, -v2 op_sel_hi:[1,1,0]
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v2, -1.0, v0 op_sel_hi:[0,1,1]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: fsub_fpext_fneg_fmul_f16_to_f32:
@ -732,7 +732,7 @@ define float @fsub_fpext_fneg_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v0, 1.0, -v2 op_sel_hi:[1,1,0]
; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v2, -1.0, v0 op_sel_hi:[0,1,1]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-F32FLUSH-LABEL: fsub_fpext_fneg_fmul_f16_to_f32:
@ -772,7 +772,7 @@ define float @fsub_fneg_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v0, 1.0, -v2 op_sel_hi:[1,1,0]
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v2, -1.0, v0 op_sel_hi:[0,1,1]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: fsub_fneg_fpext_fmul_f16_to_f32:
@ -780,7 +780,7 @@ define float @fsub_fneg_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v0, 1.0, -v2 op_sel_hi:[1,1,0]
; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v2, -1.0, v0 op_sel_hi:[0,1,1]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-F32FLUSH-LABEL: fsub_fneg_fpext_fmul_f16_to_f32:
@ -886,7 +886,7 @@ define float @fsub_fpext_muladd_mul_f16_to_f32(half %x, half %y, float %z, half
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v3.l, v3.l, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v3, 1.0, -v2 op_sel_hi:[1,1,0]
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v2, -1.0, v3 op_sel_hi:[0,1,1]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: fsub_fpext_muladd_mul_f16_to_f32:
@ -895,7 +895,7 @@ define float @fsub_fpext_muladd_mul_f16_to_f32(half %x, half %y, float %z, half
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_fmac_f16_e32 v3, v0, v1
; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v3, 1.0, -v2 op_sel_hi:[1,1,0]
; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v2, -1.0, v3 op_sel_hi:[0,1,1]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-F32FLUSH-LABEL: fsub_fpext_muladd_mul_f16_to_f32:
@ -903,7 +903,7 @@ define float @fsub_fpext_muladd_mul_f16_to_f32(half %x, half %y, float %z, half
; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-F32FLUSH-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX9-F32FLUSH-NEXT: v_fma_f16 v0, v0, v1, v3
; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, 1.0, -v2 op_sel_hi:[1,1,0]
; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v2, -1.0, v0 op_sel_hi:[0,1,1]
; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-F32DENORM-LABEL: fsub_fpext_muladd_mul_f16_to_f32:
@ -1004,7 +1004,7 @@ define float @fsub_fpext_muladd_mul_f16_to_f32_commute(float %x, half %y, half %
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v3.l, v3.l, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v1.l, v2.l
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, -v0, 1.0, v3 op_sel_hi:[0,1,1]
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v3, -1.0, v0 op_sel_hi:[1,1,0]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute:
@ -1013,7 +1013,7 @@ define float @fsub_fpext_muladd_mul_f16_to_f32_commute(float %x, half %y, half %
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_fmac_f16_e32 v3, v1, v2
; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, -v0, 1.0, v3 op_sel_hi:[0,1,1]
; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v3, -1.0, v0 op_sel_hi:[1,1,0]
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-F32FLUSH-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute:
@ -1021,7 +1021,7 @@ define float @fsub_fpext_muladd_mul_f16_to_f32_commute(float %x, half %y, half %
; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-F32FLUSH-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX9-F32FLUSH-NEXT: v_fma_f16 v1, v1, v2, v3
; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, -v0, 1.0, v1 op_sel_hi:[0,1,1]
; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v1, -1.0, v0 op_sel_hi:[1,1,0]
; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-F32DENORM-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute:

View File

@ -703,7 +703,7 @@ define float @v_mad_mix_f32_negbf16lo_add_bf16lo(bfloat %src0, bfloat %src1) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, 1.0, -v0 op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, -1.0, v1 op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
%src1.ext = fpext bfloat %src1 to float
@ -731,7 +731,7 @@ define float @v_mad_mix_f32_negabsbf16lo_add_bf16lo(bfloat %src0, bfloat %src1)
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, 1.0, -|v0| op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, -1.0, v1 op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
%src1.ext = fpext bfloat %src1 to float
@ -758,7 +758,7 @@ define float @v_mad_mix_f32_bf16lo_add_negf32(bfloat %src0, float %src1) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1.0, -v1 op_sel_hi:[1,1,0]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, -1.0, v0 op_sel_hi:[0,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
%src1.neg = fneg float %src1
@ -784,7 +784,7 @@ define float @v_mad_mix_f32_bf16lo_add_negabsf32(bfloat %src0, float %src1) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1.0, -|v1| op_sel_hi:[1,1,0]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v1|, -1.0, v0 op_sel_hi:[0,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
%src1.abs = call float @llvm.fabs.f32(float %src1)
@ -838,7 +838,7 @@ define float @v_mad_mix_f32_negprecvtbf16lo_add_bf16lo(i32 %src0.arg, bfloat %sr
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, 1.0, -v0 op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, -1.0, v1 op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
%src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 0
@ -870,7 +870,7 @@ define float @v_mad_mix_f32_negabsprecvtbf16lo_add_bf16lo(i32 %src0.arg, bfloat
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, 1.0, -|v0| op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, -1.0, v1 op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
%src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 0
@ -1324,7 +1324,7 @@ define float @v_mad_mix_f32_bf16lo_sub_bf16lo(bfloat %src0, bfloat %src1) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1.0, -v1 op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, -1.0, v0 op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
%src1.ext = fpext bfloat %src1 to float
@ -1337,7 +1337,7 @@ define float @v_mad_mix_f32_absbf16lo_sub_bf16lo(bfloat %src0, bfloat %src1) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, 1.0, -v1 op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, -1.0, |v0| op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
%src1.ext = fpext bfloat %src1 to float
@ -1351,7 +1351,7 @@ define float @v_mad_mix_f32_bf16hi_fsub_bf16hi(i32 %src0, i32 %src1) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1.0, -v1 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, -1.0, v0 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.hi = lshr i32 %src0, 16
%src1.hi = lshr i32 %src1, 16
@ -1370,7 +1370,7 @@ define float @v_mad_mix_f32_absbf16hi_fsub_bf16hi(i32 %src0, i32 %src1) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, 1.0, -v1 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, -1.0, |v0| op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.hi = lshr i32 %src0, 16
%src1.hi = lshr i32 %src1, 16

View File

@ -2946,7 +2946,7 @@ define float @v_mad_mix_f32_negf16lo_add_f16lo(half %src0, half %src1) {
; SDAG-GFX1100-LABEL: v_mad_mix_f32_negf16lo_add_f16lo:
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v1, 1.0, -v0 op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX900-LABEL: v_mad_mix_f32_negf16lo_add_f16lo:
@ -2960,7 +2960,7 @@ define float @v_mad_mix_f32_negf16lo_add_f16lo(half %src0, half %src1) {
; SDAG-GFX906-LABEL: v_mad_mix_f32_negf16lo_add_f16lo:
; SDAG-GFX906: ; %bb.0:
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v1, 1.0, -v0 op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_negf16lo_add_f16lo:
@ -3100,7 +3100,7 @@ define float @v_mad_mix_f32_negabsf16lo_add_f16lo(half %src0, half %src1) {
; GFX1100-LABEL: v_mad_mix_f32_negabsf16lo_add_f16lo:
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_fma_mix_f32 v0, v1, 1.0, -|v0| op_sel_hi:[1,1,1]
; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, -1.0, v1 op_sel_hi:[1,1,1]
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_negabsf16lo_add_f16lo:
@ -3114,7 +3114,7 @@ define float @v_mad_mix_f32_negabsf16lo_add_f16lo(half %src0, half %src1) {
; GFX906-LABEL: v_mad_mix_f32_negabsf16lo_add_f16lo:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_fma_mix_f32 v0, v1, 1.0, -|v0| op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, -1.0, v1 op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX9GEN-LABEL: v_mad_mix_f32_negabsf16lo_add_f16lo:
@ -3205,7 +3205,7 @@ define float @v_mad_mix_f32_f16lo_add_negf32(half %src0, float %src1) {
; GFX1100-LABEL: v_mad_mix_f32_f16lo_add_negf32:
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_fma_mix_f32 v0, v0, 1.0, -v1 op_sel_hi:[1,1,0]
; GFX1100-NEXT: v_fma_mix_f32 v0, v1, -1.0, v0 op_sel_hi:[0,1,1]
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_f16lo_add_negf32:
@ -3218,7 +3218,7 @@ define float @v_mad_mix_f32_f16lo_add_negf32(half %src0, float %src1) {
; GFX906-LABEL: v_mad_mix_f32_f16lo_add_negf32:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_fma_mix_f32 v0, v0, 1.0, -v1 op_sel_hi:[1,1,0]
; GFX906-NEXT: v_fma_mix_f32 v0, v1, -1.0, v0 op_sel_hi:[0,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_add_negf32:
@ -3297,7 +3297,7 @@ define float @v_mad_mix_f32_f16lo_add_negabsf32(half %src0, float %src1) {
; GFX1100-LABEL: v_mad_mix_f32_f16lo_add_negabsf32:
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_fma_mix_f32 v0, v0, 1.0, -|v1| op_sel_hi:[1,1,0]
; GFX1100-NEXT: v_fma_mix_f32 v0, |v1|, -1.0, v0 op_sel_hi:[0,1,1]
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_f16lo_add_negabsf32:
@ -3310,7 +3310,7 @@ define float @v_mad_mix_f32_f16lo_add_negabsf32(half %src0, float %src1) {
; GFX906-LABEL: v_mad_mix_f32_f16lo_add_negabsf32:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_fma_mix_f32 v0, v0, 1.0, -|v1| op_sel_hi:[1,1,0]
; GFX906-NEXT: v_fma_mix_f32 v0, |v1|, -1.0, v0 op_sel_hi:[0,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_add_negabsf32:
@ -3491,7 +3491,7 @@ define float @v_mad_mix_f32_negprecvtf16lo_add_f16lo(i32 %src0.arg, half %src1)
; SDAG-GFX1100-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo:
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v1, 1.0, -v0 op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX900-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo:
@ -3505,7 +3505,7 @@ define float @v_mad_mix_f32_negprecvtf16lo_add_f16lo(i32 %src0.arg, half %src1)
; SDAG-GFX906-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo:
; SDAG-GFX906: ; %bb.0:
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v1, 1.0, -v0 op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo:
@ -3641,7 +3641,7 @@ define float @v_mad_mix_f32_negabsprecvtf16lo_add_f16lo(i32 %src0.arg, half %src
; SDAG-GFX1100-LABEL: v_mad_mix_f32_negabsprecvtf16lo_add_f16lo:
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v1, 1.0, -|v0| op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, -1.0, v1 op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX900-LABEL: v_mad_mix_f32_negabsprecvtf16lo_add_f16lo:
@ -3655,7 +3655,7 @@ define float @v_mad_mix_f32_negabsprecvtf16lo_add_f16lo(i32 %src0.arg, half %src
; SDAG-GFX906-LABEL: v_mad_mix_f32_negabsprecvtf16lo_add_f16lo:
; SDAG-GFX906: ; %bb.0:
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v1, 1.0, -|v0| op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, |v0|, -1.0, v1 op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_negabsprecvtf16lo_add_f16lo:
@ -3880,7 +3880,7 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_add_f16lo(i32 %src0.arg, half %
; SDAG-GFX1100-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo:
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v1, 1.0, -v0 op_sel:[0,0,1] op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX900-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo:
@ -3894,7 +3894,7 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_add_f16lo(i32 %src0.arg, half %
; SDAG-GFX906-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo:
; SDAG-GFX906: ; %bb.0:
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v1, 1.0, -v0 op_sel:[0,0,1] op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, -1.0, v1 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo:
@ -4074,7 +4074,7 @@ define float @v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo(i32 %src0.arg, ha
; SDAG-GFX1100-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo:
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v1, 1.0, -|v0| op_sel:[0,0,1] op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, -1.0, v1 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX900-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo:
@ -4088,7 +4088,7 @@ define float @v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo(i32 %src0.arg, ha
; SDAG-GFX906-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo:
; SDAG-GFX906: ; %bb.0:
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v1, 1.0, -|v0| op_sel:[0,0,1] op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, |v0|, -1.0, v1 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo:
@ -5658,7 +5658,7 @@ define float @v_mad_mix_f32_f16lo_sub_f16lo(half %src0, half %src1) {
; GFX1100-LABEL: v_mad_mix_f32_f16lo_sub_f16lo:
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_fma_mix_f32 v0, v0, 1.0, -v1 op_sel_hi:[1,1,1]
; GFX1100-NEXT: v_fma_mix_f32 v0, v1, -1.0, v0 op_sel_hi:[1,1,1]
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_f16lo_sub_f16lo:
@ -5672,7 +5672,7 @@ define float @v_mad_mix_f32_f16lo_sub_f16lo(half %src0, half %src1) {
; GFX906-LABEL: v_mad_mix_f32_f16lo_sub_f16lo:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_fma_mix_f32 v0, v0, 1.0, -v1 op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mix_f32 v0, v1, -1.0, v0 op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_sub_f16lo:
@ -5708,7 +5708,7 @@ define float @v_mad_mix_f32_absf16lo_sub_f16lo(half %src0, half %src1) {
; GFX1100-LABEL: v_mad_mix_f32_absf16lo_sub_f16lo:
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, 1.0, -v1 op_sel_hi:[1,1,1]
; GFX1100-NEXT: v_fma_mix_f32 v0, v1, -1.0, |v0| op_sel_hi:[1,1,1]
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_absf16lo_sub_f16lo:
@ -5722,7 +5722,7 @@ define float @v_mad_mix_f32_absf16lo_sub_f16lo(half %src0, half %src1) {
; GFX906-LABEL: v_mad_mix_f32_absf16lo_sub_f16lo:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, 1.0, -v1 op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mix_f32 v0, v1, -1.0, |v0| op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX9GEN-LABEL: v_mad_mix_f32_absf16lo_sub_f16lo:
@ -5767,7 +5767,7 @@ define float @v_mad_mix_f32_f16hi_fsub_f16hi(i32 %src0, i32 %src1) {
; GFX1100-LABEL: v_mad_mix_f32_f16hi_fsub_f16hi:
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_fma_mix_f32 v0, v0, 1.0, -v1 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1100-NEXT: v_fma_mix_f32 v0, v1, -1.0, v0 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_f16hi_fsub_f16hi:
@ -5781,7 +5781,7 @@ define float @v_mad_mix_f32_f16hi_fsub_f16hi(i32 %src0, i32 %src1) {
; GFX906-LABEL: v_mad_mix_f32_f16hi_fsub_f16hi:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_fma_mix_f32 v0, v0, 1.0, -v1 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mix_f32 v0, v1, -1.0, v0 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX9GEN-LABEL: v_mad_mix_f32_f16hi_fsub_f16hi:
@ -5825,7 +5825,7 @@ define float @v_mad_mix_f32_absf16hi_fsub_f16hi(i32 %src0, i32 %src1) {
; GFX1100-LABEL: v_mad_mix_f32_absf16hi_fsub_f16hi:
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, 1.0, -v1 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1100-NEXT: v_fma_mix_f32 v0, v1, -1.0, |v0| op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_absf16hi_fsub_f16hi:
@ -5839,7 +5839,7 @@ define float @v_mad_mix_f32_absf16hi_fsub_f16hi(i32 %src0, i32 %src1) {
; GFX906-LABEL: v_mad_mix_f32_absf16hi_fsub_f16hi:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, 1.0, -v1 op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mix_f32 v0, v1, -1.0, |v0| op_sel:[1,0,1] op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX9GEN-LABEL: v_mad_mix_f32_absf16hi_fsub_f16hi: