From 5226289b8ea27ba53586d2c4db6d72368b0924d4 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 2 Apr 2026 16:05:08 +0200 Subject: [PATCH] Revert "AMDGPU: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3" (#190159) This reverts commit 47f6a19181b426baa03182ab6a7a41e16b35301d. Breaks MIOpen, don't have proper fix yet. --- llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp | 33 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 - .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 6 - llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 13 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 4 +- .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 124 +-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 766 ++++++++++++------ 7 files changed, 601 insertions(+), 353 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp index 4659dcd1a78c..b17cabf37d53 100644 --- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp @@ -34,36 +34,6 @@ using namespace llvm; #define DEBUG_TYPE "gcn-vopd-utils" -// Check if MI is a VOP3P instruction with operands that satisfy the constraints -// for mapping it to a VOP2/VOPD opcode: no modifiers, no clamp, src1 and src2 -// are registers (src0 can be register or literal), and src2 is same as dst. 
-static bool canMapVOP3PToVOPD(const MachineInstr &MI) { - unsigned Opc = MI.getOpcode(); - if (Opc != AMDGPU::V_DOT2_F32_F16 && Opc != AMDGPU::V_DOT2_F32_BF16) - return false; - // src0 can be register or literal - int16_t Src0ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); - if (MI.getOperand(Src0ModsIdx).getImm() != SISrcMods::OP_SEL_1) - return false; - int16_t Src1ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); - if (MI.getOperand(Src1ModsIdx).getImm() != SISrcMods::OP_SEL_1) - return false; - int16_t Src1Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1); - if (!MI.getOperand(Src1Idx).isReg()) - return false; - int16_t Src2ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers); - if (MI.getOperand(Src2ModsIdx).getImm() != SISrcMods::OP_SEL_1) - return false; - int16_t Src2Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2); - if (!MI.getOperand(Src2Idx).isReg()) - return false; - int16_t ClampIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::clamp); - if (MI.getOperand(ClampIdx).getImm() != 0) - return false; - int16_t VdstIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); - return MI.getOperand(VdstIdx).getReg() == MI.getOperand(Src2Idx).getReg(); -} - bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, const MachineInstr &MIX, const MachineInstr &MIY, bool IsVOPD3) { @@ -74,8 +44,7 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, if (IsVOPD3 && !ST.hasVOPD3()) return false; - if (!IsVOPD3 && ((TII.isVOP3(MIX) && !canMapVOP3PToVOPD(MIX)) || - (TII.isVOP3(MIY) && !canMapVOP3PToVOPD(MIY)))) + if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY))) return false; if (TII.isDPP(MIX) || TII.isDPP(MIY)) return false; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0bc509c4a6b2..2e631d2f4a55 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7012,14 +7012,6 @@ 
SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOperand(0).setReg(OriginalExec); return BB; } - case AMDGPU::V_DOT2_F32_F16: - case AMDGPU::V_DOT2_F32_BF16: { - // Hint RA to assign dst and src2 the same physical register. - // For targets without VOP2, but with VOPD, variant of the instruction this - // is one of the conditions to attempt converting VOP3P to VOPD. - MRI.setSimpleHint(MI.getOperand(0).getReg(), MI.getOperand(6).getReg()); - return BB; - } default: if (TII->isImage(MI) || TII->isMUBUF(MI)) { if (!MI.mayStore()) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 2b617e54bdfe..749cead8a20f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -928,12 +928,6 @@ ComponentProps::ComponentProps(const MCInstrDesc &OpDesc, bool VOP3Layout) { NumVOPD3Mods = 2; if (IsVOP3) SrcOperandsNum = 3; - } else if (Opcode == AMDGPU::V_DOT2_F32_F16 || - Opcode == AMDGPU::V_DOT2_F32_BF16) { - // VOP3P opcodes that have VOPD but don't have VOP2 version. Using VOPD3 - // path in getIndexOfSrcInMCOperands to get correct src operand indexes, - // but generating VOPD, not VOPD3. 
- NumVOPD3Mods = SrcOperandsNum; } else if (isSISrcFPOperand(OpDesc, getNamedOperandIdx(Opcode, OpName::src0))) { // All FP VOPD instructions have Neg modifiers for all operands except diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 554273675077..d8665739e150 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -87,13 +87,11 @@ multiclass VOP3PInst VOPDOp, string VOPDName> { + SDPatternOperator node = null_frag> { def NAME : VOP3P_Pseudo.ret>, - VOPD_Component; + VOP3PModsDOT, VOP3PModsF32>.ret>; let SubtargetPredicate = isGFX11Plus in { if P.HasExtVOP3DPP then def _dpp : VOP3_DPP_Pseudo { @@ -614,12 +612,12 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3P_Profile, int_amdgcn_udot2, 1>; } // End OtherPredicates = [HasDot2Insts] -let OtherPredicates = [HasDot10Insts], isCommutable = 1, usesCustomInserter = 1 in +let OtherPredicates = [HasDot10Insts] in defm V_DOT2_F32_F16 : VOP3PInstDotWithDual<"v_dot2_f32_f16", VOP3P_Profile, - AMDGPUfdot2, 0xC, "v_dot2acc_f32_f16">; + AMDGPUfdot2>; let OtherPredicates = [HasDot7Insts] in { defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", @@ -642,10 +640,9 @@ def DOT2_BF16_Profile let SubtargetPredicate = HasDot12Insts in { -let isCommutable = 1, usesCustomInserter = 1 in defm V_DOT2_F32_BF16 : VOP3PInstDotWithDual<"v_dot2_f32_bf16", DOT2_BF16_Profile, - int_amdgcn_fdot2_f32_bf16, 0xD, "v_dot2acc_f32_bf16">; + int_amdgcn_fdot2_f32_bf16>; } // End SubtargetPredicate = HasDot12Insts diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 40b0476a84d2..82545a472cf1 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -34,8 +34,8 @@ class VOP { string OpName = opName; } -// First 13 insts from VOPDY are also VOPDX. -defvar VOPDX_Max_Index = 13; +// First 13 insts from VOPDY are also VOPDX. 
DOT2ACC_F32_BF16 is omitted +defvar VOPDX_Max_Index = 12; defvar VOPD3X_Max_Index = 36; class VOPD_Component OpIn, string vOPDName> { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index 9e5a8c672deb..1dff54ac3542 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -225,7 +225,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b(<2 x bfloat> %a, float %c) { ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v0, v1 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1 %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> , float %c, i1 false) ret float %ret } @@ -373,7 +373,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b_clamp(<2 x bfloat> %a, float %c) ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_clamp: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v0, v1 clamp +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1 clamp %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> , float %c, i1 true) ret float %ret } @@ -395,8 +395,9 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, v3, v4 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -406,15 +407,15 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, define 
float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %neg.a = fneg <2 x bfloat> %a %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -435,8 +436,9 @@ define float @v_fdot2_f32_bf16_neg_a_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_xor_b16 v0.l, 0x8000, v0.l -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %a_lo = extractelement <2 x bfloat> %a, i32 0 %neg.a_lo = fneg bfloat %a_lo %neg_lo.a = insertelement <2 x bfloat> %a, bfloat %neg.a_lo, i32 0 @@ -460,8 +462,9 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_xor_b16 v0.h, 0x8000, v0.h -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 -; GFX11PLUS: 
v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %a_hi = extractelement <2 x bfloat> %a, i32 1 %neg.a_hi = fneg bfloat %a_hi %neg_hi.a = insertelement <2 x bfloat> %a, bfloat %neg.a_hi, i32 1 @@ -474,15 +477,15 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %neg.b = fneg <2 x bfloat> %b %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -503,8 +506,9 @@ define float @v_fdot2_f32_bf16_neg_b_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_xor_b16 v1.l, 0x8000, v1.l -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %b_lo = extractelement <2 x bfloat> %b, i32 0 
%neg.b_lo = fneg bfloat %b_lo %neg_lo.b = insertelement <2 x bfloat> %b, bfloat %neg.b_lo, i32 0 @@ -528,8 +532,9 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_xor_b16 v1.h, 0x8000, v1.h -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %b_hi = extractelement <2 x bfloat> %b, i32 1 %neg.b_hi = fneg bfloat %b_hi %neg_hi.b = insertelement <2 x bfloat> %b, bfloat %neg.b_hi, i32 1 @@ -542,15 +547,15 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_c_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1] +; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1] -; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %neg.c = fneg float %c %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -561,15 +566,15 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> 
%b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_abs_c_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1] +; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1] -; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %abs.c = call float @llvm.fabs.f32(float %c) %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -589,8 +594,9 @@ define float @v_fdot2_f32_bf16_opsel_lo_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_mov_b16_e32 v0.l, v0.h -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -610,8 +616,9 @@ define float @v_fdot2_f32_bf16_opsel_hi_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_mov_b16_e32 v0.h, v0.l -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -631,8 +638,9 @@ define float @v_fdot2_f32_bf16_opsel_lo_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_mov_b16_e32 v1.l, v1.h -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -652,8 +660,9 @@ define float @v_fdot2_f32_bf16_opsel_hi_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_mov_b16_e32 v1.h, v1.l -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) @@ -673,8 +682,9 @@ define float @v_fdot2_f32_bf16_inline_literal_a_y(<2 x 
bfloat> %a, <2 x bfloat> ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_y: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, 0x40004000, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> , <2 x bfloat> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -690,8 +700,9 @@ define float @v_fdot2_f32_bf16_inline_literal_a_xy(<2 x bfloat> %a, <2 x bfloat> ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_xy: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, 0x40004000, v4, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> , <2 x bfloat> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> , <2 x bfloat> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -707,8 +718,9 @@ define float @v_fdot2_f32_bf16_inline_literal_b_x(<2 x bfloat> %a, <2 x bfloat> ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_x: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_bf16 v5, v4, v3 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, v4, v3, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> , float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %e, <2 x bfloat> %d, float %f, i1 false) %r = fadd float %r0, %r1 @@ -724,8 +736,9 @@ define float 
@v_fdot2_f32_bf16_inline_literal_b_y(<2 x bfloat> %a, <2 x bfloat> ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_y: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v1, v0 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v3 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v1, v0, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, 0x40004000, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %b, <2 x bfloat> %a, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> , float %f, i1 false) %r = fadd float %r0, %r1 @@ -741,8 +754,9 @@ define float @v_fdot2_f32_bf16_inline_literal_b_xy(<2 x bfloat> %a, <2 x bfloat> ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_xy: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v3 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v2 +; GFX11PLUS: v_dot2_f32_bf16 v1, v3, 0x40004000, v5 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> , float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> , float %f, i1 false) %r = fadd float %r0, %r1 @@ -760,8 +774,8 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x bfloat> %a, <2 x bfloa ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_c_dual: ; GFX11PLUS: ; %bb.0: ; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, 2.0 -; GFX11PLUS: v_dot2_f32_bf16 v4, v2, v3, v4 -; GFX11PLUS: v_add_f32_e32 v0, v0, v4 +; GFX11PLUS: v_dot2_f32_bf16 v1, v2, v3, v4 +; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %r0 = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float 2.0, i1 false) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -771,9 +785,9 @@ define float 
@v_fdot2_f32_bf16_inline_literal_c_dual(<2 x bfloat> %a, <2 x bfloa define float @v_fdot2_f32_bf16_clamp_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GCN-LABEL: v_fdot2_f32_bf16_clamp_dual: ; GCN: ; %bb.0: -; GCN: v_dot2_f32_bf16 v2, v0, v1, v2 clamp -; GCN: v_dot2_f32_bf16 v5, v3, v4, v5 clamp -; GCN: v_add_f32_e32 v0, v2, v5 +; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 clamp +; GCN: v_dot2_f32_bf16 v1, v3, v4, v5 clamp +; GCN: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 true) %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 true) %r = fadd float %r0, %r1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll index 73410d8f32ff..a16cc091eb76 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll @@ -3,8 +3,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=GCN,GFX950 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX11 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX1170-GFX12 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX1170-GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX1170 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX12 declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp) @@ -28,9 +28,13 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-GFX12-LABEL: v_fdot2: -; GFX1170-GFX12: ; %bb.0: -; 
GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-LABEL: v_fdot2: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; +; GFX12-LABEL: v_fdot2: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) ret float %r } @@ -67,10 +71,15 @@ define float @v_fdot2_neg_a_lo(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-GFX12-LABEL: v_fdot2_neg_a_lo: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_xor_b16 v0.l, 0x8000, v0.l -; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-LABEL: v_fdot2_neg_a_lo: +; GFX1170: ; %bb.0: +; GFX1170: v_xor_b16 v0.l, 0x8000, v0.l +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; +; GFX12-LABEL: v_fdot2_neg_a_lo: +; GFX12: ; %bb.0: +; GFX12: v_xor_b16 v0.l, 0x8000, v0.l +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %a_lo = extractelement <2 x half> %a, i32 0 %neg.a_lo = fneg half %a_lo %neg_lo.a = insertelement <2 x half> %a, half %neg.a_lo, i32 0 @@ -102,10 +111,15 @@ define float @v_fdot2_neg_a_hi(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-GFX12-LABEL: v_fdot2_neg_a_hi: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_xor_b16 v0.h, 0x8000, v0.h -; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-LABEL: v_fdot2_neg_a_hi: +; GFX1170: ; %bb.0: +; GFX1170: v_xor_b16 v0.h, 0x8000, v0.h +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; +; GFX12-LABEL: v_fdot2_neg_a_hi: +; GFX12: ; %bb.0: +; GFX12: v_xor_b16 v0.h, 0x8000, v0.h +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %a_hi = extractelement <2 x half> %a, i32 1 %neg.a_hi = fneg half %a_hi %neg_hi.a = insertelement <2 x half> %a, half %neg.a_hi, i32 1 @@ -145,10 +159,15 @@ define float @v_fdot2_neg_b_lo(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-GFX12-LABEL: 
v_fdot2_neg_b_lo: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_xor_b16 v1.l, 0x8000, v1.l -; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-LABEL: v_fdot2_neg_b_lo: +; GFX1170: ; %bb.0: +; GFX1170: v_xor_b16 v1.l, 0x8000, v1.l +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; +; GFX12-LABEL: v_fdot2_neg_b_lo: +; GFX12: ; %bb.0: +; GFX12: v_xor_b16 v1.l, 0x8000, v1.l +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %b_lo = extractelement <2 x half> %b, i32 0 %neg.b_lo = fneg half %b_lo %neg_lo.b = insertelement <2 x half> %b, half %neg.b_lo, i32 0 @@ -180,10 +199,15 @@ define float @v_fdot2_neg_b_hi(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-GFX12-LABEL: v_fdot2_neg_b_hi: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_xor_b16 v1.h, 0x8000, v1.h -; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-LABEL: v_fdot2_neg_b_hi: +; GFX1170: ; %bb.0: +; GFX1170: v_xor_b16 v1.h, 0x8000, v1.h +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; +; GFX12-LABEL: v_fdot2_neg_b_hi: +; GFX12: ; %bb.0: +; GFX12: v_xor_b16 v1.h, 0x8000, v1.h +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %b_hi = extractelement <2 x half> %b, i32 1 %neg.b_hi = fneg half %b_hi %neg_hi.b = insertelement <2 x half> %b, half %neg.b_hi, i32 1 @@ -231,10 +255,15 @@ define float @v_fdot2_opsel_lo_a(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-GFX12-LABEL: v_fdot2_opsel_lo_a: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_mov_b16_e32 v0.l, v0.h -; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-LABEL: v_fdot2_opsel_lo_a: +; GFX1170: ; %bb.0: +; GFX1170: v_mov_b16_e32 v0.l, v0.h +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; +; GFX12-LABEL: v_fdot2_opsel_lo_a: +; GFX12: ; %bb.0: +; GFX12: v_mov_b16_e32 v0.l, v0.h +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> %r = call float @llvm.amdgcn.fdot2(<2 x 
half> %shuf, <2 x half> %b, float %c, i1 false) ret float %r @@ -262,10 +291,15 @@ define float @v_fdot2_opsel_hi_a(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-GFX12-LABEL: v_fdot2_opsel_hi_a: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_mov_b16_e32 v0.h, v0.l -; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-LABEL: v_fdot2_opsel_hi_a: +; GFX1170: ; %bb.0: +; GFX1170: v_mov_b16_e32 v0.h, v0.l +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; +; GFX12-LABEL: v_fdot2_opsel_hi_a: +; GFX12: ; %bb.0: +; GFX12: v_mov_b16_e32 v0.h, v0.l +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false) ret float %r @@ -293,10 +327,15 @@ define float @v_fdot2_opsel_lo_b(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-GFX12-LABEL: v_fdot2_opsel_lo_b: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_mov_b16_e32 v1.l, v1.h -; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-LABEL: v_fdot2_opsel_lo_b: +; GFX1170: ; %bb.0: +; GFX1170: v_mov_b16_e32 v1.l, v1.h +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; +; GFX12-LABEL: v_fdot2_opsel_lo_b: +; GFX12: ; %bb.0: +; GFX12: v_mov_b16_e32 v1.l, v1.h +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false) ret float %r @@ -324,10 +363,15 @@ define float @v_fdot2_opsel_hi_b(<2 x half> %a, <2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-GFX12-LABEL: v_fdot2_opsel_hi_b: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_mov_b16_e32 v1.h, v1.l -; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170-LABEL: v_fdot2_opsel_hi_b: +; GFX1170: ; %bb.0: +; GFX1170: 
v_mov_b16_e32 v1.h, v1.l +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; +; GFX12-LABEL: v_fdot2_opsel_hi_b: +; GFX12: ; %bb.0: +; GFX12: v_mov_b16_e32 v1.h, v1.l +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false) ret float %r @@ -353,9 +397,13 @@ define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) { ; GFX11: v_dot2acc_f32_f16 v1, 0x40004000, v0 ; GFX11: v_mov_b32_e32 v0, v1 ; -; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_a: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_dot2_f32_f16 v0, 0x40004000, v0, v1 +; GFX1170-LABEL: v_fdot2_inline_literal_a: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, 0x40004000, v0, v1 +; +; GFX12-LABEL: v_fdot2_inline_literal_a: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, 0x40004000, v0, v1 %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> , <2 x half> %b, float %c, i1 false) ret float %ret } @@ -380,9 +428,13 @@ define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) { ; GFX11: v_dot2acc_f32_f16 v1, 0x40004000, v0 ; GFX11: v_mov_b32_e32 v0, v1 ; -; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_b: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_dot2_f32_f16 v0, 0x40004000, v0, v1 +; GFX1170-LABEL: v_fdot2_inline_literal_b: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, v0, 0x40004000, v1 +; +; GFX12-LABEL: v_fdot2_inline_literal_b: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v0, 0x40004000, v1 %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> , float %c, i1 false) ret float %ret } @@ -410,9 +462,13 @@ define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x half> %b) { ; GFX11: v_dot2acc_f32_f16 v2, v0, v1 ; GFX11: v_mov_b32_e32 v0, v2 ; -; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_c: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, 2.0 +; GFX1170-LABEL: v_fdot2_inline_literal_c: +; GFX1170: ; %bb.0: +; GFX1170: 
v_dot2_f32_f16 v0, v0, v1, 2.0 +; +; GFX12-LABEL: v_fdot2_inline_literal_c: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v0, v1, 2.0 %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 2.0, i1 false) ret float %ret } @@ -594,7 +650,7 @@ define float @v_fdot2_inline_literal_b_clamp(<2 x half> %a, float %c) { ; ; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_clamp: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_f16 v0, 0x40004000, v0, v1 clamp +; GFX11PLUS: v_dot2_f32_f16 v0, v0, 0x40004000, v1 clamp %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> , float %c, i1 true) ret float %ret } @@ -610,9 +666,9 @@ define float @v_fdot2_inline_literal_c_clamp(<2 x half> %a, <2 x half> %b) { define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_dual: ; GFX950: ; %bb.0: @@ -626,10 +682,22 @@ define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d ; GFX10: v_dot2c_f32_f16 v5, v3, v4 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11PLUS-LABEL: v_fdot2_dual: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_dual: +; GFX11: ; %bb.0: +; GFX11: v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_dual: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX12: v_dot2_f32_f16 v1, v3, 
v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -639,33 +707,39 @@ define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_a_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_neg_a_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX10-LABEL: v_fdot2_neg_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_a_dual: ; GFX11: ; %bb.0: -; GFX11: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX11: v_dot2acc_f32_f16 v5, v3, v4 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_add_f32_e32 v0, v0, v5 ; -; GFX1170-GFX12-LABEL: v_fdot2_neg_a_dual: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX1170-GFX12: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX1170-GFX12: v_add_f32_e32 v0, v2, v5 +; GFX1170-LABEL: 
v_fdot2_neg_a_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_neg_a_dual: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %neg.a = fneg <2 x half> %a %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -676,9 +750,9 @@ define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_a_lo_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_neg_a_lo_dual: ; GFX950: ; %bb.0: @@ -691,15 +765,29 @@ define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_a_lo_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; -; GFX11PLUS-LABEL: v_fdot2_neg_a_lo_dual: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_xor_b16 v0.l, 0x8000, v0.l -; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_neg_a_lo_dual: +; GFX11: ; %bb.0: +; GFX11: v_xor_b16 v0.l, 0x8000, v0.l +; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; 
GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_neg_a_lo_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_xor_b16 v0.l, 0x8000, v0.l +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_neg_a_lo_dual: +; GFX12: ; %bb.0: +; GFX12: v_xor_b16 v0.l, 0x8000, v0.l +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %a_lo = extractelement <2 x half> %a, i32 0 %neg.a_lo = fneg half %a_lo %neg_lo.a = insertelement <2 x half> %a, half %neg.a_lo, i32 0 @@ -712,9 +800,9 @@ define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_a_hi_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[1,0,0] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0] +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_neg_a_hi_dual: ; GFX950: ; %bb.0: @@ -728,15 +816,29 @@ define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_a_hi_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[1,0,0] +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; -; GFX11PLUS-LABEL: v_fdot2_neg_a_hi_dual: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_xor_b16 v0.h, 0x8000, v0.h -; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_neg_a_hi_dual: +; GFX11: ; %bb.0: +; GFX11: v_xor_b16 v0.h, 0x8000, v0.h +; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, 
v1 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_neg_a_hi_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_xor_b16 v0.h, 0x8000, v0.h +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_neg_a_hi_dual: +; GFX12: ; %bb.0: +; GFX12: v_xor_b16 v0.h, 0x8000, v0.h +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %a_hi = extractelement <2 x half> %a, i32 1 %neg.a_hi = fneg half %a_hi %neg_hi.a = insertelement <2 x half> %a, half %neg.a_hi, i32 1 @@ -749,33 +851,39 @@ define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_b_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_neg_b_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX10-LABEL: v_fdot2_neg_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_b_dual: ; GFX11: ; %bb.0: -; GFX11: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; 
GFX11: v_dot2acc_f32_f16 v5, v3, v4 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_add_f32_e32 v0, v0, v5 ; -; GFX1170-GFX12-LABEL: v_fdot2_neg_b_dual: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX1170-GFX12: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX1170-GFX12: v_add_f32_e32 v0, v2, v5 +; GFX1170-LABEL: v_fdot2_neg_b_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_neg_b_dual: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %neg.b = fneg <2 x half> %b %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -786,9 +894,9 @@ define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_b_lo_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_neg_b_lo_dual: ; GFX950: ; %bb.0: @@ -801,15 +909,29 @@ define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_b_lo_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; -; GFX11PLUS-LABEL: v_fdot2_neg_b_lo_dual: -; 
GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_xor_b16 v1.l, 0x8000, v1.l -; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_neg_b_lo_dual: +; GFX11: ; %bb.0: +; GFX11: v_xor_b16 v1.l, 0x8000, v1.l +; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_neg_b_lo_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_xor_b16 v1.l, 0x8000, v1.l +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_neg_b_lo_dual: +; GFX12: ; %bb.0: +; GFX12: v_xor_b16 v1.l, 0x8000, v1.l +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %b_lo = extractelement <2 x half> %b, i32 0 %neg.b_lo = fneg half %b_lo %neg_lo.b = insertelement <2 x half> %b, half %neg.b_lo, i32 0 @@ -822,9 +944,9 @@ define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_b_hi_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,1,0] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0] +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_neg_b_hi_dual: ; GFX950: ; %bb.0: @@ -838,15 +960,29 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_b_hi_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,1,0] +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; -; GFX11PLUS-LABEL: 
v_fdot2_neg_b_hi_dual: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_xor_b16 v1.h, 0x8000, v1.h -; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_neg_b_hi_dual: +; GFX11: ; %bb.0: +; GFX11: v_xor_b16 v1.h, 0x8000, v1.h +; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_neg_b_hi_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_xor_b16 v1.h, 0x8000, v1.h +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_neg_b_hi_dual: +; GFX12: ; %bb.0: +; GFX12: v_xor_b16 v1.h, 0x8000, v1.h +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %b_hi = extractelement <2 x half> %b, i32 1 %neg.b_hi = fneg half %b_hi %neg_hi.b = insertelement <2 x half> %b, half %neg.b_hi, i32 1 @@ -859,33 +995,39 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_c_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_neg_c_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1] +; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX10-LABEL: v_fdot2_neg_c_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1] +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX10: 
v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_c_dual: ; GFX11: ; %bb.0: -; GFX11: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1] +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX11: v_dot2acc_f32_f16 v5, v3, v4 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_add_f32_e32 v0, v0, v5 ; -; GFX1170-GFX12-LABEL: v_fdot2_neg_c_dual: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1] -; GFX1170-GFX12: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX1170-GFX12: v_add_f32_e32 v0, v2, v5 +; GFX1170-LABEL: v_fdot2_neg_c_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_neg_c_dual: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %neg.c = fneg float %c %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -896,33 +1038,39 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_abs_c_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_abs_c_dual: ; GFX950: ; %bb.0: -; GFX950: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1] +; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, 
v5 ; ; GFX10-LABEL: v_fdot2_abs_c_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1] +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_abs_c_dual: ; GFX11: ; %bb.0: -; GFX11: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1] +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX11: v_dot2acc_f32_f16 v5, v3, v4 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_add_f32_e32 v0, v0, v5 ; -; GFX1170-GFX12-LABEL: v_fdot2_abs_c_dual: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1] -; GFX1170-GFX12: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX1170-GFX12: v_add_f32_e32 v0, v2, v5 +; GFX1170-LABEL: v_fdot2_abs_c_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_abs_c_dual: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %abs.c = call float @llvm.fabs.f32(float %c) %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -933,9 +1081,9 @@ define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_opsel_lo_a_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[1,0,0] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_opsel_lo_a_dual: ; GFX950: ; %bb.0: @@ -947,15 
+1095,29 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_lo_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[1,0,0] +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; -; GFX11PLUS-LABEL: v_fdot2_opsel_lo_a_dual: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_mov_b16_e32 v0.l, v0.h -; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_opsel_lo_a_dual: +; GFX11: ; %bb.0: +; GFX11: v_mov_b16_e32 v0.l, v0.h +; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_opsel_lo_a_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_mov_b16_e32 v0.l, v0.h +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_opsel_lo_a_dual: +; GFX12: ; %bb.0: +; GFX12: v_mov_b16_e32 v0.l, v0.h +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -966,9 +1128,9 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_opsel_hi_a_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[0,1,1] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] +; GFX906: v_dot2_f32_f16 v1, 
v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_opsel_hi_a_dual: ; GFX950: ; %bb.0: @@ -980,15 +1142,29 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_hi_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[0,1,1] +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; -; GFX11PLUS-LABEL: v_fdot2_opsel_hi_a_dual: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_mov_b16_e32 v0.h, v0.l -; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_opsel_hi_a_dual: +; GFX11: ; %bb.0: +; GFX11: v_mov_b16_e32 v0.h, v0.l +; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_opsel_hi_a_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_mov_b16_e32 v0.h, v0.l +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_opsel_hi_a_dual: +; GFX12: ; %bb.0: +; GFX12: v_mov_b16_e32 v0.h, v0.l +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -999,9 +1175,9 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_opsel_lo_b_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[0,1,0] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; 
GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_opsel_lo_b_dual: ; GFX950: ; %bb.0: @@ -1013,15 +1189,29 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_lo_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[0,1,0] +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; -; GFX11PLUS-LABEL: v_fdot2_opsel_lo_b_dual: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_mov_b16_e32 v1.l, v1.h -; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_opsel_lo_b_dual: +; GFX11: ; %bb.0: +; GFX11: v_mov_b16_e32 v1.l, v1.h +; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_opsel_lo_b_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_mov_b16_e32 v1.l, v1.h +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_opsel_lo_b_dual: +; GFX12: ; %bb.0: +; GFX12: v_mov_b16_e32 v1.l, v1.h +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -1032,9 +1222,9 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_opsel_hi_b_dual: ; 
GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[1,0,1] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_opsel_hi_b_dual: ; GFX950: ; %bb.0: @@ -1046,15 +1236,29 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_hi_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[1,0,1] +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; -; GFX11PLUS-LABEL: v_fdot2_opsel_hi_b_dual: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_mov_b16_e32 v1.h, v1.l -; GFX11PLUS: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_opsel_hi_b_dual: +; GFX11: ; %bb.0: +; GFX11: v_mov_b16_e32 v1.h, v1.l +; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_opsel_hi_b_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_mov_b16_e32 v1.h, v1.l +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_opsel_hi_b_dual: +; GFX12: ; %bb.0: +; GFX12: v_mov_b16_e32 v1.h, v1.l +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) @@ -1068,9 +1272,9 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 define float 
@v_fdot2_inline_literal_a_x(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_inline_literal_a_x: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, 2.0, v1, v2 op_sel_hi:[0,1,1] -; GFX906: v_dot2_f32_f16 v5, v3, v4, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, 2.0, v1, v2 op_sel_hi:[0,1,1] +; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_inline_literal_a_x: ; GFX950: ; %bb.0: @@ -1084,10 +1288,22 @@ define float @v_fdot2_inline_literal_a_x(<2 x half> %a, <2 x half> %b, float %c, ; GFX10: v_dot2c_f32_f16 v5, v3, v4 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_x: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_inline_literal_a_x: +; GFX11: ; %bb.0: +; GFX11: v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_inline_literal_a_x: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, 0x40004000, v1, v2 +; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_inline_literal_a_x: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, 0x40004000, v1, v2 +; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> , <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1097,9 +1313,9 @@ define float @v_fdot2_inline_literal_a_x(<2 x half> %a, <2 x half> %b, float %c, define float @v_fdot2_inline_literal_a_y(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_inline_literal_a_y: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, v1, v2 -; 
GFX906: v_dot2_f32_f16 v5, 2.0, v4, v5 op_sel_hi:[0,1,1] -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX906: v_dot2_f32_f16 v1, 2.0, v4, v5 op_sel_hi:[0,1,1] +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_inline_literal_a_y: ; GFX950: ; %bb.0: @@ -1113,10 +1329,22 @@ define float @v_fdot2_inline_literal_a_y(<2 x half> %a, <2 x half> %b, float %c, ; GFX10: v_dot2c_f32_f16 v5, 0x40004000, v4 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_y: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v4 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_inline_literal_a_y: +; GFX11: ; %bb.0: +; GFX11: v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v4 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_inline_literal_a_y: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170: v_dot2_f32_f16 v1, 0x40004000, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_inline_literal_a_y: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX12: v_dot2_f32_f16 v1, 0x40004000, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> , <2 x half> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1126,9 +1354,9 @@ define float @v_fdot2_inline_literal_a_y(<2 x half> %a, <2 x half> %b, float %c, define float @v_fdot2_inline_literal_a_xy(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_inline_literal_a_xy: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, 2.0, v1, v2 op_sel_hi:[0,1,1] -; GFX906: v_dot2_f32_f16 v5, 2.0, v4, v5 op_sel_hi:[0,1,1] -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, 2.0, v1, v2 op_sel_hi:[0,1,1] +; GFX906: v_dot2_f32_f16 v1, 2.0, v4, 
v5 op_sel_hi:[0,1,1] +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_inline_literal_a_xy: ; GFX950: ; %bb.0: @@ -1142,10 +1370,22 @@ define float @v_fdot2_inline_literal_a_xy(<2 x half> %a, <2 x half> %b, float %c ; GFX10: v_dot2c_f32_f16 v5, 0x40004000, v4 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_xy: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v4 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_inline_literal_a_xy: +; GFX11: ; %bb.0: +; GFX11: v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v4 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_inline_literal_a_xy: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, 0x40004000, v1, v2 +; GFX1170: v_dot2_f32_f16 v1, 0x40004000, v4, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_inline_literal_a_xy: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, 0x40004000, v1, v2 +; GFX12: v_dot2_f32_f16 v1, 0x40004000, v4, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> , <2 x half> %b, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> , <2 x half> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1155,9 +1395,9 @@ define float @v_fdot2_inline_literal_a_xy(<2 x half> %a, <2 x half> %b, float %c define float @v_fdot2_inline_literal_b_x(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_inline_literal_b_x: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, 2.0, v2 op_sel_hi:[1,0,1] -; GFX906: v_dot2_f32_f16 v5, v4, v3, v5 -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1] +; GFX906: v_dot2_f32_f16 v1, v4, v3, v5 +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_inline_literal_b_x: ; GFX950: ; %bb.0: @@ -1171,10 +1411,22 @@ define float 
@v_fdot2_inline_literal_b_x(<2 x half> %a, <2 x half> %b, float %c, ; GFX10: v_dot2c_f32_f16 v5, v4, v3 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_x: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v5, v4, v3 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_inline_literal_b_x: +; GFX11: ; %bb.0: +; GFX11: v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v5, v4, v3 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_inline_literal_b_x: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, v0, 0x40004000, v2 +; GFX1170: v_dot2_f32_f16 v1, v4, v3, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_inline_literal_b_x: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v0, 0x40004000, v2 +; GFX12: v_dot2_f32_f16 v1, v4, v3, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %e, <2 x half> %d, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1184,9 +1436,9 @@ define float @v_fdot2_inline_literal_b_x(<2 x half> %a, <2 x half> %b, float %c, define float @v_fdot2_inline_literal_b_y(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_inline_literal_b_y: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v1, v0, v2 -; GFX906: v_dot2_f32_f16 v5, v3, 2.0, v5 op_sel_hi:[1,0,1] -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v1, v0, v2 +; GFX906: v_dot2_f32_f16 v1, v3, 2.0, v5 op_sel_hi:[1,0,1] +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_inline_literal_b_y: ; GFX950: ; %bb.0: @@ -1200,10 +1452,22 @@ define float @v_fdot2_inline_literal_b_y(<2 x half> %a, <2 x half> %b, float %c, ; GFX10: v_dot2c_f32_f16 v5, 0x40004000, v3 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_y: -; GFX11PLUS: ; %bb.0: -; 
GFX11PLUS: v_dual_dot2acc_f32_f16 v2, v1, v0 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v3 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: v_fdot2_inline_literal_b_y: +; GFX11: ; %bb.0: +; GFX11: v_dual_dot2acc_f32_f16 v2, v1, v0 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v3 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_inline_literal_b_y: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, v1, v0, v2 +; GFX1170: v_dot2_f32_f16 v1, v3, 0x40004000, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_inline_literal_b_y: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v1, v0, v2 +; GFX12: v_dot2_f32_f16 v1, v3, 0x40004000, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %b, <2 x half> %a, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> <half 2.0, half 2.0>, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1213,9 +1477,9 @@ define float @v_fdot2_inline_literal_b_y(<2 x half> %a, <2 x half> %b, float %c, define float @v_fdot2_inline_literal_b_xy(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_inline_literal_b_xy: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v2, v0, 2.0, v2 op_sel_hi:[1,0,1] -; GFX906: v_dot2_f32_f16 v5, v3, 2.0, v5 op_sel_hi:[1,0,1] -; GFX906: v_add_f32_e32 v0, v2, v5 +; GFX906: v_dot2_f32_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1] +; GFX906: v_dot2_f32_f16 v1, v3, 2.0, v5 op_sel_hi:[1,0,1] +; GFX906: v_add_f32_e32 v0, v0, v1 ; ; GFX950-LABEL: v_fdot2_inline_literal_b_xy: ; GFX950: ; %bb.0: @@ -1229,10 +1493,22 @@ define float @v_fdot2_inline_literal_b_xy(<2 x half> %a, <2 x half> %b, float %c ; GFX10: v_dot2c_f32_f16 v5, 0x40004000, v3 ; GFX10: v_add_f32_e32 v0, v2, v5 ; -; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_xy: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v3 -; GFX11PLUS: v_add_f32_e32 v0, v2, v5 +; GFX11-LABEL: 
v_fdot2_inline_literal_b_xy: +; GFX11: ; %bb.0: +; GFX11: v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v3 +; GFX11: v_add_f32_e32 v0, v2, v5 +; +; GFX1170-LABEL: v_fdot2_inline_literal_b_xy: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, v0, 0x40004000, v2 +; GFX1170: v_dot2_f32_f16 v1, v3, 0x40004000, v5 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_inline_literal_b_xy: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v0, 0x40004000, v2 +; GFX12: v_dot2_f32_f16 v1, v3, 0x40004000, v5 +; GFX12: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> <half 2.0, half 2.0>, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1243,8 +1519,8 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> %a, <2 x half> %b, <2 x h ; GFX906-LABEL: v_fdot2_inline_literal_c_dual: ; GFX906: ; %bb.0: ; GFX906: v_dot2_f32_f16 v0, v0, v1, 2.0 -; GFX906: v_dot2_f32_f16 v4, v2, v3, v4 -; GFX906: v_add_f32_e32 v0, v0, v4 ; ; GFX950-LABEL: v_fdot2_inline_literal_c_dual: ; GFX950: ; %bb.0: @@ -1266,11 +1542,17 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> %a, <2 x half> %b, <2 x h ; GFX11: v_dot2acc_f32_f16 v5, v0, v1 ; GFX11: v_add_f32_e32 v0, v5, v4 ; -; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_c_dual: -; GFX1170-GFX12: ; %bb.0: -; GFX1170-GFX12: v_dot2_f32_f16 v0, v0, v1, 2.0 -; GFX1170-GFX12: v_dot2_f32_f16 v4, v2, v3, v4 -; GFX1170-GFX12: v_add_f32_e32 v0, v0, v4 +; GFX1170-LABEL: v_fdot2_inline_literal_c_dual: +; GFX1170: ; %bb.0: +; GFX1170: v_dot2_f32_f16 v0, v0, v1, 2.0 +; GFX1170: v_dot2_f32_f16 v1, v2, v3, v4 +; GFX1170: v_add_f32_e32 v0, v0, v1 +; +; GFX12-LABEL: v_fdot2_inline_literal_c_dual: +; GFX12: ; %bb.0: +; GFX12: v_dot2_f32_f16 v0, v0, v1, 2.0 +; GFX12: v_dot2_f32_f16 v1, v2, v3, v4 +; GFX12: v_add_f32_e32 v0, v0, v1 %r0 = tail 
call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 2.0, i1 false) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false) %r = fadd float %r0, %r1 @@ -1280,9 +1562,9 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> %a, <2 x half> %b, <2 x h define float @v_fdot2_clamp_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GCN-LABEL: v_fdot2_clamp_dual: ; GCN: ; %bb.0: -; GCN: v_dot2_f32_f16 v2, v0, v1, v2 clamp -; GCN: v_dot2_f32_f16 v5, v3, v4, v5 clamp -; GCN: v_add_f32_e32 v0, v2, v5 +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 clamp +; GCN: v_dot2_f32_f16 v1, v3, v4, v5 clamp +; GCN: v_add_f32_e32 v0, v0, v1 %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 true) %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 true) %r = fadd float %r0, %r1