Revert "AMDGPU: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3" (#190159)
This reverts commit 47f6a19181b426baa03182ab6a7a41e16b35301d. It breaks MIOpen, and we don't have a proper fix yet.
This commit is contained in:
parent
75359e1e1f
commit
5226289b8e
@ -34,36 +34,6 @@ using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "gcn-vopd-utils"
|
||||
|
||||
// Check if MI is a VOP3P instruction with operands that satisfy the constraints
|
||||
// for mapping it to a VOP2/VOPD opcode: no modifiers, no clamp, src1 and src2
|
||||
// are registers (src0 can be register or literal), and src2 is same as dst.
|
||||
static bool canMapVOP3PToVOPD(const MachineInstr &MI) {
|
||||
unsigned Opc = MI.getOpcode();
|
||||
// Only V_DOT2_F32_F16 and V_DOT2_F32_BF16 are candidates; every other opcode
// is rejected up front.
if (Opc != AMDGPU::V_DOT2_F32_F16 && Opc != AMDGPU::V_DOT2_F32_BF16)
|
||||
return false;
|
||||
// src0 can be register or literal
// (so only its modifiers are inspected below; the src0 operand kind is not.)
|
||||
int16_t Src0ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
|
||||
// Reject unless the src0 modifier immediate is exactly SISrcMods::OP_SEL_1.
// NOTE(review): presumably OP_SEL_1 alone is the "no modifiers" encoding for
// these packed opcodes -- confirm against SISrcMods.
if (MI.getOperand(Src0ModsIdx).getImm() != SISrcMods::OP_SEL_1)
|
||||
return false;
|
||||
int16_t Src1ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
|
||||
// Same modifier requirement for src1.
if (MI.getOperand(Src1ModsIdx).getImm() != SISrcMods::OP_SEL_1)
|
||||
return false;
|
||||
int16_t Src1Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
|
||||
// src1 must be a register operand.
if (!MI.getOperand(Src1Idx).isReg())
|
||||
return false;
|
||||
int16_t Src2ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers);
|
||||
// Same modifier requirement for src2.
if (MI.getOperand(Src2ModsIdx).getImm() != SISrcMods::OP_SEL_1)
|
||||
return false;
|
||||
int16_t Src2Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
|
||||
// src2 must be a register operand; getReg() is called on it in the final
// comparison below.
if (!MI.getOperand(Src2Idx).isReg())
|
||||
return false;
|
||||
int16_t ClampIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::clamp);
|
||||
// The clamp immediate must be zero.
if (MI.getOperand(ClampIdx).getImm() != 0)
|
||||
return false;
|
||||
int16_t VdstIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
|
||||
// Final condition: the destination and src2 must be the same register.
return MI.getOperand(VdstIdx).getReg() == MI.getOperand(Src2Idx).getReg();
|
||||
}
|
||||
|
||||
bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
|
||||
const MachineInstr &MIX,
|
||||
const MachineInstr &MIY, bool IsVOPD3) {
|
||||
@ -74,8 +44,7 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
|
||||
|
||||
if (IsVOPD3 && !ST.hasVOPD3())
|
||||
return false;
|
||||
if (!IsVOPD3 && ((TII.isVOP3(MIX) && !canMapVOP3PToVOPD(MIX)) ||
|
||||
(TII.isVOP3(MIY) && !canMapVOP3PToVOPD(MIY))))
|
||||
if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY)))
|
||||
return false;
|
||||
if (TII.isDPP(MIX) || TII.isDPP(MIY))
|
||||
return false;
|
||||
|
||||
@ -7012,14 +7012,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
|
||||
MI.getOperand(0).setReg(OriginalExec);
|
||||
return BB;
|
||||
}
|
||||
case AMDGPU::V_DOT2_F32_F16:
|
||||
case AMDGPU::V_DOT2_F32_BF16: {
|
||||
// Hint RA to assign dst and src2 the same physical register.
|
||||
// For targets without VOP2, but with VOPD, variant of the instruction this
|
||||
// is one of the conditions to attempt converting VOP3P to VOPD.
|
||||
MRI.setSimpleHint(MI.getOperand(0).getReg(), MI.getOperand(6).getReg());
|
||||
return BB;
|
||||
}
|
||||
default:
|
||||
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
|
||||
if (!MI.mayStore())
|
||||
|
||||
@ -928,12 +928,6 @@ ComponentProps::ComponentProps(const MCInstrDesc &OpDesc, bool VOP3Layout) {
|
||||
NumVOPD3Mods = 2;
|
||||
if (IsVOP3)
|
||||
SrcOperandsNum = 3;
|
||||
} else if (Opcode == AMDGPU::V_DOT2_F32_F16 ||
|
||||
Opcode == AMDGPU::V_DOT2_F32_BF16) {
|
||||
// VOP3P opcodes that have VOPD but don't have VOP2 version. Using VOPD3
|
||||
// path in getIndexOfSrcInMCOperands to get correct src operand indexes,
|
||||
// but generating VOPD, not VOPD3.
|
||||
NumVOPD3Mods = SrcOperandsNum;
|
||||
} else if (isSISrcFPOperand(OpDesc,
|
||||
getNamedOperandIdx(Opcode, OpName::src0))) {
|
||||
// All FP VOPD instructions have Neg modifiers for all operands except
|
||||
|
||||
@ -87,13 +87,11 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
|
||||
}
|
||||
|
||||
multiclass VOP3PInstDotWithDual<string OpName, VOPProfile P,
|
||||
SDPatternOperator node = null_frag,
|
||||
bits<6> VOPDOp, string VOPDName> {
|
||||
SDPatternOperator node = null_frag> {
|
||||
def NAME : VOP3P_Pseudo<OpName, P,
|
||||
getVOP3PModPat<P, node,
|
||||
1 /*HasExplicitClamp*/, 1/*IsDOT*/,
|
||||
VOP3PModsDOT, VOP3PModsF32>.ret>,
|
||||
VOPD_Component<VOPDOp, VOPDName>;
|
||||
VOP3PModsDOT, VOP3PModsF32>.ret>;
|
||||
let SubtargetPredicate = isGFX11Plus in {
|
||||
if P.HasExtVOP3DPP then
|
||||
def _dpp : VOP3_DPP_Pseudo<OpName, P> {
|
||||
@ -614,12 +612,12 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
|
||||
VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
|
||||
} // End OtherPredicates = [HasDot2Insts]
|
||||
|
||||
let OtherPredicates = [HasDot10Insts], isCommutable = 1, usesCustomInserter = 1 in
|
||||
let OtherPredicates = [HasDot10Insts] in
|
||||
defm V_DOT2_F32_F16 :
|
||||
VOP3PInstDotWithDual<"v_dot2_f32_f16",
|
||||
VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR,
|
||||
/*HasDPP*/ 1>,
|
||||
AMDGPUfdot2, 0xC, "v_dot2acc_f32_f16">;
|
||||
AMDGPUfdot2>;
|
||||
|
||||
let OtherPredicates = [HasDot7Insts] in {
|
||||
defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
|
||||
@ -642,10 +640,9 @@ def DOT2_BF16_Profile
|
||||
|
||||
let SubtargetPredicate = HasDot12Insts in {
|
||||
|
||||
let isCommutable = 1, usesCustomInserter = 1 in
|
||||
defm V_DOT2_F32_BF16 :
|
||||
VOP3PInstDotWithDual<"v_dot2_f32_bf16", DOT2_BF16_Profile,
|
||||
int_amdgcn_fdot2_f32_bf16, 0xD, "v_dot2acc_f32_bf16">;
|
||||
int_amdgcn_fdot2_f32_bf16>;
|
||||
|
||||
} // End SubtargetPredicate = HasDot12Insts
|
||||
|
||||
|
||||
@ -34,8 +34,8 @@ class VOP <string opName> {
|
||||
string OpName = opName;
|
||||
}
|
||||
|
||||
// First 13 insts from VOPDY are also VOPDX.
|
||||
defvar VOPDX_Max_Index = 13;
|
||||
// First 13 insts from VOPDY are also VOPDX. DOT2ACC_F32_BF16 is omitted
|
||||
defvar VOPDX_Max_Index = 12;
|
||||
defvar VOPD3X_Max_Index = 36;
|
||||
|
||||
class VOPD_Component<bits<6> OpIn, string vOPDName> {
|
||||
|
||||
@ -225,7 +225,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b(<2 x bfloat> %a, float %c) {
|
||||
;
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v0, v1
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1
|
||||
%ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
|
||||
ret float %ret
|
||||
}
|
||||
@ -373,7 +373,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b_clamp(<2 x bfloat> %a, float %c)
|
||||
;
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_clamp:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v0, v1 clamp
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1 clamp
|
||||
%ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 true)
|
||||
ret float %ret
|
||||
}
|
||||
@ -395,8 +395,9 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c,
|
||||
;
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, v3, v4
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
|
||||
%r = fadd float %r0, %r1
|
||||
@ -406,15 +407,15 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c,
|
||||
define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
|
||||
; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual:
|
||||
; GFX950: ; %bb.0:
|
||||
; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
|
||||
; GFX950: v_add_f32_e32 v0, v2, v5
|
||||
; GFX950: v_add_f32_e32 v0, v0, v5
|
||||
;
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%neg.a = fneg <2 x bfloat> %a
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
|
||||
@ -435,8 +436,9 @@ define float @v_fdot2_f32_bf16_neg_a_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_xor_b16 v0.l, 0x8000, v0.l
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%a_lo = extractelement <2 x bfloat> %a, i32 0
|
||||
%neg.a_lo = fneg bfloat %a_lo
|
||||
%neg_lo.a = insertelement <2 x bfloat> %a, bfloat %neg.a_lo, i32 0
|
||||
@ -460,8 +462,9 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_xor_b16 v0.h, 0x8000, v0.h
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%a_hi = extractelement <2 x bfloat> %a, i32 1
|
||||
%neg.a_hi = fneg bfloat %a_hi
|
||||
%neg_hi.a = insertelement <2 x bfloat> %a, bfloat %neg.a_hi, i32 1
|
||||
@ -474,15 +477,15 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
|
||||
define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
|
||||
; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual:
|
||||
; GFX950: ; %bb.0:
|
||||
; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
|
||||
; GFX950: v_add_f32_e32 v0, v2, v5
|
||||
; GFX950: v_add_f32_e32 v0, v0, v5
|
||||
;
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%neg.b = fneg <2 x bfloat> %b
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
|
||||
@ -503,8 +506,9 @@ define float @v_fdot2_f32_bf16_neg_b_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_xor_b16 v1.l, 0x8000, v1.l
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%b_lo = extractelement <2 x bfloat> %b, i32 0
|
||||
%neg.b_lo = fneg bfloat %b_lo
|
||||
%neg_lo.b = insertelement <2 x bfloat> %b, bfloat %neg.b_lo, i32 0
|
||||
@ -528,8 +532,9 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_xor_b16 v1.h, 0x8000, v1.h
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%b_hi = extractelement <2 x bfloat> %b, i32 1
|
||||
%neg.b_hi = fneg bfloat %b_hi
|
||||
%neg_hi.b = insertelement <2 x bfloat> %b, bfloat %neg.b_hi, i32 1
|
||||
@ -542,15 +547,15 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
|
||||
define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
|
||||
; GFX950-LABEL: v_fdot2_f32_bf16_neg_c_dual:
|
||||
; GFX950: ; %bb.0:
|
||||
; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1]
|
||||
; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
|
||||
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
|
||||
; GFX950: v_add_f32_e32 v0, v2, v5
|
||||
; GFX950: v_add_f32_e32 v0, v0, v5
|
||||
;
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1]
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%neg.c = fneg float %c
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
|
||||
@ -561,15 +566,15 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa
|
||||
define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
|
||||
; GFX950-LABEL: v_fdot2_f32_bf16_abs_c_dual:
|
||||
; GFX950: ; %bb.0:
|
||||
; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1]
|
||||
; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
|
||||
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
|
||||
; GFX950: v_add_f32_e32 v0, v2, v5
|
||||
; GFX950: v_add_f32_e32 v0, v0, v5
|
||||
;
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1]
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%abs.c = call float @llvm.fabs.f32(float %c)
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
|
||||
@ -589,8 +594,9 @@ define float @v_fdot2_f32_bf16_opsel_lo_a_dual(<2 x bfloat> %a, <2 x bfloat> %b,
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_mov_b16_e32 v0.l, v0.h
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
|
||||
@ -610,8 +616,9 @@ define float @v_fdot2_f32_bf16_opsel_hi_a_dual(<2 x bfloat> %a, <2 x bfloat> %b,
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_mov_b16_e32 v0.h, v0.l
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
|
||||
@ -631,8 +638,9 @@ define float @v_fdot2_f32_bf16_opsel_lo_b_dual(<2 x bfloat> %a, <2 x bfloat> %b,
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_mov_b16_e32 v1.l, v1.h
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
|
||||
@ -652,8 +660,9 @@ define float @v_fdot2_f32_bf16_opsel_hi_b_dual(<2 x bfloat> %a, <2 x bfloat> %b,
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_mov_b16_e32 v1.h, v1.l
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
|
||||
@ -673,8 +682,9 @@ define float @v_fdot2_f32_bf16_inline_literal_a_y(<2 x bfloat> %a, <2 x bfloat>
|
||||
;
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_y:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, 0x40004000, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %e, float %f, i1 false)
|
||||
%r = fadd float %r0, %r1
|
||||
@ -690,8 +700,9 @@ define float @v_fdot2_f32_bf16_inline_literal_a_xy(<2 x bfloat> %a, <2 x bfloat>
|
||||
;
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_xy:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v1, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, 0x40004000, v4, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %b, float %c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %e, float %f, i1 false)
|
||||
%r = fadd float %r0, %r1
|
||||
@ -707,8 +718,9 @@ define float @v_fdot2_f32_bf16_inline_literal_b_x(<2 x bfloat> %a, <2 x bfloat>
|
||||
;
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_x:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_bf16 v5, v4, v3
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v4, v3, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %e, <2 x bfloat> %d, float %f, i1 false)
|
||||
%r = fadd float %r0, %r1
|
||||
@ -724,8 +736,9 @@ define float @v_fdot2_f32_bf16_inline_literal_b_y(<2 x bfloat> %a, <2 x bfloat>
|
||||
;
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_y:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v1, v0 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v3
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v1, v0, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, 0x40004000, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %b, <2 x bfloat> %a, float %c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %f, i1 false)
|
||||
%r = fadd float %r0, %r1
|
||||
@ -741,8 +754,9 @@ define float @v_fdot2_f32_bf16_inline_literal_b_xy(<2 x bfloat> %a, <2 x bfloat>
|
||||
;
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_xy:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v3
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v2, v5
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v2
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, 0x40004000, v5
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %f, i1 false)
|
||||
%r = fadd float %r0, %r1
|
||||
@ -760,8 +774,8 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x bfloat> %a, <2 x bfloa
|
||||
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_c_dual:
|
||||
; GFX11PLUS: ; %bb.0:
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, 2.0
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v4, v2, v3, v4
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v4
|
||||
; GFX11PLUS: v_dot2_f32_bf16 v1, v2, v3, v4
|
||||
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
|
||||
%r0 = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float 2.0, i1 false)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
|
||||
%r = fadd float %r0, %r1
|
||||
@ -771,9 +785,9 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x bfloat> %a, <2 x bfloa
|
||||
define float @v_fdot2_f32_bf16_clamp_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
|
||||
; GCN-LABEL: v_fdot2_f32_bf16_clamp_dual:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN: v_dot2_f32_bf16 v2, v0, v1, v2 clamp
|
||||
; GCN: v_dot2_f32_bf16 v5, v3, v4, v5 clamp
|
||||
; GCN: v_add_f32_e32 v0, v2, v5
|
||||
; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
|
||||
; GCN: v_dot2_f32_bf16 v1, v3, v4, v5 clamp
|
||||
; GCN: v_add_f32_e32 v0, v0, v1
|
||||
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 true)
|
||||
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 true)
|
||||
%r = fadd float %r0, %r1
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user