Revert "AMDGPU: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3" (#190159)

This reverts commit 47f6a19181b426baa03182ab6a7a41e16b35301d.
Breaks MIOpen; we don't have a proper fix yet.
This commit is contained in:
Petar Avramovic 2026-04-02 16:05:08 +02:00 committed by GitHub
parent 75359e1e1f
commit 5226289b8e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 601 additions and 353 deletions

View File

@ -34,36 +34,6 @@ using namespace llvm;
#define DEBUG_TYPE "gcn-vopd-utils"
// Check if MI is a VOP3P instruction with operands that satisfy the constraints
// for mapping it to a VOP2/VOPD opcode: no modifiers, no clamp, src1 and src2
// are registers (src0 can be register or literal), and src2 is same as dst.
//
// Only V_DOT2_F32_F16 and V_DOT2_F32_BF16 are accepted; any other opcode
// returns false. The checks are applied in order and the first failing one
// returns false:
//   1. src0/src1/src2 modifier immediates must each equal SISrcMods::OP_SEL_1.
//   2. src1 and src2 must be register operands.
//   3. clamp must be 0.
//   4. vdst and src2 must name the same register (the final return value).
//
// NOTE(review): the operand indexes returned by getNamedOperandIdx are used
// without checking for a "not found" result; this assumes both accepted
// opcodes define every operand queried here -- confirm.
static bool canMapVOP3PToVOPD(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
// Reject everything except the two dot2 opcodes handled by this helper.
if (Opc != AMDGPU::V_DOT2_F32_F16 && Opc != AMDGPU::V_DOT2_F32_BF16)
return false;
// src0 can be register or literal
// Only the src0 modifiers are inspected; the src0 operand itself is not
// checked in this function.
// NOTE(review): OP_SEL_1 alone is presumably the "no modifiers" encoding for
// these VOP3P instructions -- confirm against SISrcMods.
int16_t Src0ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
if (MI.getOperand(Src0ModsIdx).getImm() != SISrcMods::OP_SEL_1)
return false;
// src1: same modifier requirement, and the operand must be a register.
int16_t Src1ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
if (MI.getOperand(Src1ModsIdx).getImm() != SISrcMods::OP_SEL_1)
return false;
int16_t Src1Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (!MI.getOperand(Src1Idx).isReg())
return false;
// src2: same modifier requirement, and the operand must be a register so it
// can be compared against vdst below.
int16_t Src2ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers);
if (MI.getOperand(Src2ModsIdx).getImm() != SISrcMods::OP_SEL_1)
return false;
int16_t Src2Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (!MI.getOperand(Src2Idx).isReg())
return false;
// Any non-zero clamp value disqualifies the instruction.
int16_t ClampIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::clamp);
if (MI.getOperand(ClampIdx).getImm() != 0)
return false;
// Final condition: the destination must be the same register as src2.
int16_t VdstIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
return MI.getOperand(VdstIdx).getReg() == MI.getOperand(Src2Idx).getReg();
}
bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
const MachineInstr &MIX,
const MachineInstr &MIY, bool IsVOPD3) {
@ -74,8 +44,7 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
if (IsVOPD3 && !ST.hasVOPD3())
return false;
if (!IsVOPD3 && ((TII.isVOP3(MIX) && !canMapVOP3PToVOPD(MIX)) ||
(TII.isVOP3(MIY) && !canMapVOP3PToVOPD(MIY))))
if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY)))
return false;
if (TII.isDPP(MIX) || TII.isDPP(MIY))
return false;

View File

@ -7012,14 +7012,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.getOperand(0).setReg(OriginalExec);
return BB;
}
case AMDGPU::V_DOT2_F32_F16:
case AMDGPU::V_DOT2_F32_BF16: {
// Hint RA to assign dst and src2 the same physical register.
// For targets without VOP2, but with VOPD, variant of the instruction this
// is one of the conditions to attempt converting VOP3P to VOPD.
MRI.setSimpleHint(MI.getOperand(0).getReg(), MI.getOperand(6).getReg());
return BB;
}
default:
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
if (!MI.mayStore())

View File

@ -928,12 +928,6 @@ ComponentProps::ComponentProps(const MCInstrDesc &OpDesc, bool VOP3Layout) {
NumVOPD3Mods = 2;
if (IsVOP3)
SrcOperandsNum = 3;
} else if (Opcode == AMDGPU::V_DOT2_F32_F16 ||
Opcode == AMDGPU::V_DOT2_F32_BF16) {
// VOP3P opcodes that have VOPD but don't have VOP2 version. Using VOPD3
// path in getIndexOfSrcInMCOperands to get correct src operand indexes,
// but generating VOPD, not VOPD3.
NumVOPD3Mods = SrcOperandsNum;
} else if (isSISrcFPOperand(OpDesc,
getNamedOperandIdx(Opcode, OpName::src0))) {
// All FP VOPD instructions have Neg modifiers for all operands except

View File

@ -87,13 +87,11 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
}
multiclass VOP3PInstDotWithDual<string OpName, VOPProfile P,
SDPatternOperator node = null_frag,
bits<6> VOPDOp, string VOPDName> {
SDPatternOperator node = null_frag> {
def NAME : VOP3P_Pseudo<OpName, P,
getVOP3PModPat<P, node,
1 /*HasExplicitClamp*/, 1/*IsDOT*/,
VOP3PModsDOT, VOP3PModsF32>.ret>,
VOPD_Component<VOPDOp, VOPDName>;
VOP3PModsDOT, VOP3PModsF32>.ret>;
let SubtargetPredicate = isGFX11Plus in {
if P.HasExtVOP3DPP then
def _dpp : VOP3_DPP_Pseudo<OpName, P> {
@ -614,12 +612,12 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
} // End OtherPredicates = [HasDot2Insts]
let OtherPredicates = [HasDot10Insts], isCommutable = 1, usesCustomInserter = 1 in
let OtherPredicates = [HasDot10Insts] in
defm V_DOT2_F32_F16 :
VOP3PInstDotWithDual<"v_dot2_f32_f16",
VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR,
/*HasDPP*/ 1>,
AMDGPUfdot2, 0xC, "v_dot2acc_f32_f16">;
AMDGPUfdot2>;
let OtherPredicates = [HasDot7Insts] in {
defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
@ -642,10 +640,9 @@ def DOT2_BF16_Profile
let SubtargetPredicate = HasDot12Insts in {
let isCommutable = 1, usesCustomInserter = 1 in
defm V_DOT2_F32_BF16 :
VOP3PInstDotWithDual<"v_dot2_f32_bf16", DOT2_BF16_Profile,
int_amdgcn_fdot2_f32_bf16, 0xD, "v_dot2acc_f32_bf16">;
int_amdgcn_fdot2_f32_bf16>;
} // End SubtargetPredicate = HasDot12Insts

View File

@ -34,8 +34,8 @@ class VOP <string opName> {
string OpName = opName;
}
// First 13 insts from VOPDY are also VOPDX.
defvar VOPDX_Max_Index = 13;
// First 13 insts from VOPDY are also VOPDX. DOT2ACC_F32_BF16 is omitted
defvar VOPDX_Max_Index = 12;
defvar VOPD3X_Max_Index = 36;
class VOPD_Component<bits<6> OpIn, string vOPDName> {

View File

@ -225,7 +225,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b(<2 x bfloat> %a, float %c) {
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v0, v1
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1
%ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
ret float %ret
}
@ -373,7 +373,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b_clamp(<2 x bfloat> %a, float %c)
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_clamp:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v0, v1 clamp
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1 clamp
%ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 true)
ret float %ret
}
@ -395,8 +395,9 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c,
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, v3, v4
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
%r = fadd float %r0, %r1
@ -406,15 +407,15 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c,
define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual:
; GFX950: ; %bb.0:
; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
; GFX950: v_add_f32_e32 v0, v2, v5
; GFX950: v_add_f32_e32 v0, v0, v5
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%neg.a = fneg <2 x bfloat> %a
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@ -435,8 +436,9 @@ define float @v_fdot2_f32_bf16_neg_a_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_xor_b16 v0.l, 0x8000, v0.l
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%a_lo = extractelement <2 x bfloat> %a, i32 0
%neg.a_lo = fneg bfloat %a_lo
%neg_lo.a = insertelement <2 x bfloat> %a, bfloat %neg.a_lo, i32 0
@ -460,8 +462,9 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_xor_b16 v0.h, 0x8000, v0.h
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%a_hi = extractelement <2 x bfloat> %a, i32 1
%neg.a_hi = fneg bfloat %a_hi
%neg_hi.a = insertelement <2 x bfloat> %a, bfloat %neg.a_hi, i32 1
@ -474,15 +477,15 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual:
; GFX950: ; %bb.0:
; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
; GFX950: v_add_f32_e32 v0, v2, v5
; GFX950: v_add_f32_e32 v0, v0, v5
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%neg.b = fneg <2 x bfloat> %b
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@ -503,8 +506,9 @@ define float @v_fdot2_f32_bf16_neg_b_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_xor_b16 v1.l, 0x8000, v1.l
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%b_lo = extractelement <2 x bfloat> %b, i32 0
%neg.b_lo = fneg bfloat %b_lo
%neg_lo.b = insertelement <2 x bfloat> %b, bfloat %neg.b_lo, i32 0
@ -528,8 +532,9 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_xor_b16 v1.h, 0x8000, v1.h
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%b_hi = extractelement <2 x bfloat> %b, i32 1
%neg.b_hi = fneg bfloat %b_hi
%neg_hi.b = insertelement <2 x bfloat> %b, bfloat %neg.b_hi, i32 1
@ -542,15 +547,15 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
; GFX950-LABEL: v_fdot2_f32_bf16_neg_c_dual:
; GFX950: ; %bb.0:
; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1]
; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
; GFX950: v_add_f32_e32 v0, v2, v5
; GFX950: v_add_f32_e32 v0, v0, v5
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1]
; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%neg.c = fneg float %c
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@ -561,15 +566,15 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa
define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
; GFX950-LABEL: v_fdot2_f32_bf16_abs_c_dual:
; GFX950: ; %bb.0:
; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1]
; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
; GFX950: v_add_f32_e32 v0, v2, v5
; GFX950: v_add_f32_e32 v0, v0, v5
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1]
; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%abs.c = call float @llvm.fabs.f32(float %c)
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@ -589,8 +594,9 @@ define float @v_fdot2_f32_bf16_opsel_lo_a_dual(<2 x bfloat> %a, <2 x bfloat> %b,
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_mov_b16_e32 v0.l, v0.h
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@ -610,8 +616,9 @@ define float @v_fdot2_f32_bf16_opsel_hi_a_dual(<2 x bfloat> %a, <2 x bfloat> %b,
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_mov_b16_e32 v0.h, v0.l
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@ -631,8 +638,9 @@ define float @v_fdot2_f32_bf16_opsel_lo_b_dual(<2 x bfloat> %a, <2 x bfloat> %b,
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_mov_b16_e32 v1.l, v1.h
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@ -652,8 +660,9 @@ define float @v_fdot2_f32_bf16_opsel_hi_b_dual(<2 x bfloat> %a, <2 x bfloat> %b,
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_mov_b16_e32 v1.h, v1.l
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@ -673,8 +682,9 @@ define float @v_fdot2_f32_bf16_inline_literal_a_y(<2 x bfloat> %a, <2 x bfloat>
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_y:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, 0x40004000, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %e, float %f, i1 false)
%r = fadd float %r0, %r1
@ -690,8 +700,9 @@ define float @v_fdot2_f32_bf16_inline_literal_a_xy(<2 x bfloat> %a, <2 x bfloat>
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_xy:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v1, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, 0x40004000, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %e, float %f, i1 false)
%r = fadd float %r0, %r1
@ -707,8 +718,9 @@ define float @v_fdot2_f32_bf16_inline_literal_b_x(<2 x bfloat> %a, <2 x bfloat>
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_x:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_bf16 v5, v4, v3
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, v4, v3, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %e, <2 x bfloat> %d, float %f, i1 false)
%r = fadd float %r0, %r1
@ -724,8 +736,9 @@ define float @v_fdot2_f32_bf16_inline_literal_b_y(<2 x bfloat> %a, <2 x bfloat>
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_y:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v1, v0 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v3
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v1, v0, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, 0x40004000, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %b, <2 x bfloat> %a, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %f, i1 false)
%r = fadd float %r0, %r1
@ -741,8 +754,9 @@ define float @v_fdot2_f32_bf16_inline_literal_b_xy(<2 x bfloat> %a, <2 x bfloat>
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_xy:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v3
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v2
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, 0x40004000, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %f, i1 false)
%r = fadd float %r0, %r1
@ -760,8 +774,8 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x bfloat> %a, <2 x bfloa
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_c_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, 2.0
; GFX11PLUS: v_dot2_f32_bf16 v4, v2, v3, v4
; GFX11PLUS: v_add_f32_e32 v0, v0, v4
; GFX11PLUS: v_dot2_f32_bf16 v1, v2, v3, v4
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%r0 = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float 2.0, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
%r = fadd float %r0, %r1
@ -771,9 +785,9 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x bfloat> %a, <2 x bfloa
define float @v_fdot2_f32_bf16_clamp_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
; GCN-LABEL: v_fdot2_f32_bf16_clamp_dual:
; GCN: ; %bb.0:
; GCN: v_dot2_f32_bf16 v2, v0, v1, v2 clamp
; GCN: v_dot2_f32_bf16 v5, v3, v4, v5 clamp
; GCN: v_add_f32_e32 v0, v2, v5
; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
; GCN: v_dot2_f32_bf16 v1, v3, v4, v5 clamp
; GCN: v_add_f32_e32 v0, v0, v1
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 true)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 true)
%r = fadd float %r0, %r1

File diff suppressed because it is too large Load Diff