[AMDGPU] Implement v_mad_u32/v_mad_nc_u|i64_u32 on gfx1250 (#151226)

This commit is contained in:
Stanislav Mekhanoshin 2025-07-29 15:06:35 -07:00 committed by GitHub
parent 32127045c8
commit d99238263c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 419 additions and 81 deletions

View File

@ -1389,6 +1389,9 @@ def FeatureAddSubU64Insts
: SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
"Has v_add_u64 and v_sub_u64 instructions">;
def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst",
"true", "Has v_mad_u32 instruction">;
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
@ -2049,6 +2052,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureVmemPrefInsts,
FeatureLshlAddU64Inst,
FeatureAddSubU64Insts,
FeatureMadU32Inst,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
]>;
@ -2839,6 +2843,9 @@ def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">,
AssemblerPredicate<(all_of FeatureMadU32Inst)>;
def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;

View File

@ -1134,15 +1134,26 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
SDLoc SL(N);
bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
unsigned Opc;
bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
if (Subtarget->hasMADIntraFwdBug())
Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
: AMDGPU::V_MAD_U64_U32_gfx11_e64;
else if (UseNoCarry)
Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
else
Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
Clamp };
if (UseNoCarry) {
MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
CurDAG->RemoveDeadNode(N);
return;
}
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

View File

@ -574,13 +574,22 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
MRI->use_nodbg_empty(I.getOperand(1).getReg());
unsigned Opc;
if (Subtarget->hasMADIntraFwdBug())
Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
: AMDGPU::V_MAD_I64_I32_gfx11_e64;
else if (UseNoCarry)
Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
: AMDGPU::V_MAD_NC_I64_I32_e64;
else
Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
if (UseNoCarry)
I.removeOperand(1);
I.setDesc(TII.get(Opc));
I.addOperand(*MF, MachineOperand::CreateImm(0));
I.addImplicitDefUseOperands(*MF);

View File

@ -273,6 +273,7 @@ protected:
bool HasMinimum3Maximum3PKF16 = false;
bool HasLshlAddU64Inst = false;
bool HasAddSubU64Insts = false;
bool HasMadU32Inst = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
@ -1521,9 +1522,16 @@ public:
// \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
// \returns true if the target has V_MAD_U32 instruction.
bool hasMadU32Inst() const { return HasMadU32Inst; }
// \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
bool hasVectorMulU64() const { return GFX1250Insts; }
// \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
// instructions.
bool hasMadU64U32NoCarry() const { return GFX1250Insts; }
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }

View File

@ -57,6 +57,14 @@ class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> {
def V_LSHL_ADD_U64_PROF : VOP3_Profile<VOP_I64_I64_I32_I64>;
def VOP_F64_F64_F64_F64_DPP_PROF : VOP3_Profile<VOP_F64_F64_F64_F64>;
def V_MAD_U32_PROF: VOP3_Profile<VOP_I32_I32_I32_I32> {
let HasExtVOP3DPP = 0;
let HasExt64BitDPP = 1;
}
def VOP_I64_I64_I64_DPP : VOP3_Profile<VOP_I64_I64_I64>;
def VOP_I32_I32_I64_DPP : VOP3_Profile<VOPProfile<[i64, i32, i32, i64]>> {
let HasClamp = 1;
}
} // End HasExt64BitDPP = 1;
//===----------------------------------------------------------------------===//
@ -152,6 +160,15 @@ defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32
defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">;
defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
let SchedRW = [WriteIntMul] in {
let SubtargetPredicate = HasMadU32Inst in
defm V_MAD_U32 : VOP3Inst <"v_mad_u32", V_MAD_U32_PROF>;
let SubtargetPredicate = isGFX1250Plus in {
defm V_MAD_NC_U64_U32 : VOP3Inst<"v_mad_nc_u64_u32", VOP_I32_I32_I64_DPP>;
defm V_MAD_NC_I64_I32 : VOP3Inst<"v_mad_nc_i64_i32", VOP_I32_I32_I64_DPP>;
}
}
let SchedRW = [WriteDoubleAdd] in {
let FPDPRounding = 1 in {
defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">;
@ -848,6 +865,9 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
let SubtargetPredicate = HasMadU32Inst, AddedComplexity = 10 in
def : ThreeOp_i32_Pats<mul, add, V_MAD_U32_e64>;
def : GCNPat<
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
@ -1746,6 +1766,10 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>;
defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>;
defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>;
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;

View File

@ -801,15 +801,15 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mul_lo_u32 v0, v6, v5
; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1]
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
; GFX1250-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v11, v8
; GFX1250-NEXT: v_mul_lo_u32 v0, v7, v4
; GFX1250-NEXT: v_mad_u32 v5, v6, v5, v0
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v6, v3, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5
; GFX1250-NEXT: v_mov_b32_e32 v8, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v6, v4, v[10:11]
; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v3, v[4:5]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@ -1206,11 +1206,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null, v9, v5, v[0:1]
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v6, 0
; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null, v2, v4, v[10:11]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11]
; GFX1250-NEXT: v_mov_b32_e32 v12, v1
; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
@ -1220,15 +1220,13 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v8, v1, vcc_lo
; GFX1250-NEXT: v_mov_b32_e32 v1, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v5, v[8:9]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo
; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_mov_b32_e32 v2, v7
; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v3, v4, v[8:9]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: v_mad_u32 v3, v3, v4, v1
; GFX1250-NEXT: v_mov_b32_e32 v1, v6
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i128 %num, %den
ret i128 %result
@ -2856,90 +2854,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v14, 0
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], null, v0, v12, 0
; GFX1250-NEXT: v_mul_lo_u32 v26, v6, v9
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0
; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0
; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v1, v13, v[16:17]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17]
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v2, v12, v[16:17]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], null, v0, v10, 0
; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v3, v11, v[16:17]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17]
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v4, v10, v[16:17]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v5, v9, v[16:17]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[16:17]
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
; GFX1250-NEXT: v_mov_b32_e32 v20, v19
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v24, vcc_lo
; GFX1250-NEXT: v_cndmask_b32_e64 v19, 0, 1, s0
; GFX1250-NEXT: v_mov_b32_e32 v21, v22
; GFX1250-NEXT: v_mul_lo_u32 v22, v5, v10
; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v2, v8, v[16:17]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v19, vcc_lo
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v0, v13, v[20:21]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v25
; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], vcc_lo, v1, v12, v[16:17]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17]
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21]
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21]
; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v8, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v27, v13, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v6, v11, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v26, s0
; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v8, v[0:1]
; GFX1250-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v7, v8
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0
; GFX1250-NEXT: v_mov_b32_e32 v0, v16
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
@ -3004,7 +3001,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX1250: ; %bb.0:
; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
@ -3195,7 +3192,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX1250: ; %bb.0:
; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4

View File

@ -1,9 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -< %s | FileCheck --check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -< %s | FileCheck --check-prefixes=GCN,GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -amdgpu-enable-delay-alu=0 -< %s | FileCheck --check-prefixes=GCN,GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -< %s | FileCheck --check-prefixes=GCN,GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 -< %s | FileCheck --check-prefixes=GCN,GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GFX9-LABEL: mad_i32_vvv:
@ -22,6 +23,11 @@ define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: v_mov_b32_e32 v4, v0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v3, v[2:3]
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_vvv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mad_u32 v0, v0, v1, v2
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
%cast = bitcast i32 %add to float
@ -35,6 +41,34 @@ define amdgpu_ps float @mad_i32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
; GCN-NEXT: s_add_i32 s0, s0, s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: mad_i32_sss:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_add_i32 s0, s0, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: mad_i32_sss:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_add_i32 s0, s0, s2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: mad_i32_sss:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mul_i32 s0, s0, s1
; GFX11-NEXT: s_add_i32 s0, s0, s2
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_sss:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_mul_i32 s0, s0, s1
; GFX1250-NEXT: s_add_co_i32 s0, s0, s2
; GFX1250-NEXT: v_mov_b32_e32 v0, s0
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
%cast = bitcast i32 %add to float
@ -58,6 +92,11 @@ define amdgpu_ps float @mad_i32_vvc(i32 %a, i32 %b) {
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 42
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_vvc:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mad_u32 v0, v0, v1, 42
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, 42
%cast = bitcast i32 %add to float
@ -83,6 +122,11 @@ define amdgpu_ps float @mad_i32_vvi(i32 %a, i32 %b) {
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 0x12d687
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_vvi:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mad_u32 v0, v0, v1, 0x12d687
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, 1234567
%cast = bitcast i32 %add to float
@ -108,6 +152,11 @@ define amdgpu_ps float @mad_i32_vvi_neg(i32 %a, i32 %b) {
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 0xffffffffffed2979
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_vvi_neg:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mad_u32 v0, v0, v1, 0xffed2979
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, -1234567
%cast = bitcast i32 %add to float
@ -130,6 +179,11 @@ define amdgpu_ps float @mad_i32_vcv(i32 %a, i32 %c) {
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, 42, v[1:2]
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_vcv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mad_u32 v0, v0, 42, v1
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, 42
%add = add i32 %mul, %c
%cast = bitcast i32 %add to float
@ -152,6 +206,11 @@ define amdgpu_ps float @mad_i32_vcc(i32 %a) {
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, 42, 43
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_vcc:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mad_u32 v0, v0, 42, 43
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, 42
%add = add i32 %mul, 43
%cast = bitcast i32 %add to float
@ -175,6 +234,11 @@ define amdgpu_ps float @mad_i32_vvs(i32 %a, i32 %b, i32 inreg %c) {
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_vvs:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mad_u32 v0, v0, v1, s0
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
%cast = bitcast i32 %add to float
@ -197,6 +261,11 @@ define amdgpu_ps float @mad_i32_vsv(i32 %a, i32 inreg %b, i32 %c) {
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, s0, v[1:2]
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_vsv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mad_u32 v0, v0, s0, v1
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
%cast = bitcast i32 %add to float
@ -219,6 +288,11 @@ define amdgpu_ps float @mad_i32_svv(i32 inreg %a, i32 %b, i32 %c) {
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, s0, v0, v[1:2]
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_svv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mad_u32 v0, s0, v0, v1
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
%cast = bitcast i32 %add to float
@ -244,6 +318,11 @@ define amdgpu_ps float @mad_i32_vss(i32 %a, i32 inreg %b, i32 inreg %c) {
; GFX11-NEXT: s_mov_b32 s2, s1
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, s0, s[2:3]
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_vss:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mad_u32 v0, v0, s0, s1
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
%cast = bitcast i32 %add to float
@ -269,6 +348,11 @@ define amdgpu_ps float @mad_i32_svs(i32 inreg %a, i32 %b, i32 inreg %c) {
; GFX11-NEXT: s_mov_b32 s2, s1
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_svs:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mad_u32 v0, s0, v0, s1
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
%cast = bitcast i32 %add to float
@ -292,6 +376,11 @@ define amdgpu_ps float @mad_i32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, s0, s1, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_ssv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mad_u32 v0, s0, s1, v0
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
%cast = bitcast i32 %add to float
@ -322,6 +411,14 @@ define amdgpu_ps float @mad_i32_vvv_multiuse(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: flat_store_b32 v[0:1], v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: mad_i32_vvv_multiuse:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mul_lo_u32 v1, v0, v1
; GFX1250-NEXT: v_add_nc_u32_e32 v0, v1, v2
; GFX1250-NEXT: flat_store_b32 v[0:1], v1 scope:SCOPE_SE
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
store i32 %mul, ptr poison

View File

@ -3548,28 +3548,27 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX1250-NEXT: s_wait_loadcnt 0x1
; GFX1250-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v10, v0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v8, v4
; GFX1250-NEXT: v_mul_u64_e32 v[6:7], v[0:1], v[6:7]
; GFX1250-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX1250-NEXT: v_mul_u64_e32 v[8:9], v[8:9], v[10:11]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], null, v2, v4, v[6:7]
; GFX1250-NEXT: v_mul_lo_u32 v2, v2, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_mad_nc_u64_u32 v[6:7], v2, v4, v[6:7]
; GFX1250-NEXT: v_mov_b32_e32 v10, v9
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], null, v5, v0, v[10:11]
; GFX1250-NEXT: v_add3_u32 v7, v3, v7, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v5, v0, v[10:11]
; GFX1250-NEXT: v_mad_u32 v0, v3, v4, v7
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_dual_mov_b32 v10, v13 :: v_dual_mov_b32 v13, v11
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], null, v4, v1, v[12:13]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_mad_u32 v7, v2, v5, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v4, v1, v[12:13]
; GFX1250-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v9, v12
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v14, v13
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[14:15]
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v1, v[10:11]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v1, v[10:11]
; GFX1250-NEXT: v_add_nc_u64_e32 v[10:11], v[0:1], v[6:7]
; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[2:3] scale_offset
; GFX1250-NEXT: s_endpgm

View File

@ -16,6 +16,57 @@ v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3]
v_lshl_add_u64 v[2:3], v[4:5], v7, 12345
// GFX1250: v_lshl_add_u64 v[2:3], v[4:5], v7, 0x3039 ; encoding: [0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
v_mad_u32 v2, s4, v7, v8
// GFX1250: v_mad_u32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x0e,0x22,0x04]
v_mad_u32 v2, v4, 0, 1
// GFX1250: v_mad_u32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x01,0x05,0x02]
v_mad_u32 v2, v4, 3, s2
// GFX1250: v_mad_u32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x07,0x09,0x00]
v_mad_u32 v2, s4, 4, v2
// GFX1250: v_mad_u32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x08,0x09,0x04]
v_mad_u32 v2, v4, v7, 12345
// GFX1250: v_mad_u32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9]
// GFX1250: v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x0e,0x22,0x04]
v_mad_nc_u64_u32 v[2:3], v4, 0, 1
// GFX1250: v_mad_nc_u64_u32 v[2:3], v4, 0, 1 ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x01,0x05,0x02]
v_mad_nc_u64_u32 v[2:3], v4, 3, s[2:3]
// GFX1250: v_mad_nc_u64_u32 v[2:3], v4, 3, s[2:3] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x07,0x09,0x00]
v_mad_nc_u64_u32 v[2:3], s4, 4, v[2:3]
// GFX1250: v_mad_nc_u64_u32 v[2:3], s4, 4, v[2:3] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x08,0x09,0x04]
v_mad_nc_u64_u32 v[2:3], v4, v7, 12345
// GFX1250: v_mad_nc_u64_u32 v[2:3], v4, v7, 0x3039 ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] clamp
// GFX1250: v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfa,0xd6,0x04,0x0e,0x22,0x04]
v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9]
// GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x0e,0x22,0x04]
v_mad_nc_i64_i32 v[2:3], v4, 0, 1
// GFX1250: v_mad_nc_i64_i32 v[2:3], v4, 0, 1 ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x01,0x05,0x02]
v_mad_nc_i64_i32 v[2:3], v4, 3, s[2:3]
// GFX1250: v_mad_nc_i64_i32 v[2:3], v4, 3, s[2:3] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x07,0x09,0x00]
v_mad_nc_i64_i32 v[2:3], s4, 4, v[2:3]
// GFX1250: v_mad_nc_i64_i32 v[2:3], s4, 4, v[2:3] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x08,0x09,0x04]
v_mad_nc_i64_i32 v[2:3], v4, v7, 12345
// GFX1250: v_mad_nc_i64_i32 v[2:3], v4, v7, 0x3039 ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp
// GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfb,0xd6,0x04,0x0e,0x22,0x04]
v_cvt_pk_bf16_f32 v5, v1, v2
// GFX1250: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x02,0x00]

View File

@ -16,6 +16,57 @@ v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3]
v_lshl_add_u64 v[2:3], v[4:5], v7, 12345
// GFX1250: v_lshl_add_u64 v[2:3], v[4:5], v7, 0x3039 ; encoding: [0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
v_mad_u32 v2, s4, v7, v8
// GFX1250: v_mad_u32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x0e,0x22,0x04]
v_mad_u32 v2, v4, 0, 1
// GFX1250: v_mad_u32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x01,0x05,0x02]
v_mad_u32 v2, v4, 3, s2
// GFX1250: v_mad_u32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x07,0x09,0x00]
v_mad_u32 v2, s4, 4, v2
// GFX1250: v_mad_u32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x08,0x09,0x04]
v_mad_u32 v2, v4, v7, 12345
// GFX1250: v_mad_u32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9]
// GFX1250: v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x0e,0x22,0x04]
v_mad_nc_u64_u32 v[2:3], v4, 0, 1
// GFX1250: v_mad_nc_u64_u32 v[2:3], v4, 0, 1 ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x01,0x05,0x02]
v_mad_nc_u64_u32 v[2:3], v4, 3, s[2:3]
// GFX1250: v_mad_nc_u64_u32 v[2:3], v4, 3, s[2:3] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x07,0x09,0x00]
v_mad_nc_u64_u32 v[2:3], s4, 4, v[2:3]
// GFX1250: v_mad_nc_u64_u32 v[2:3], s4, 4, v[2:3] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x08,0x09,0x04]
v_mad_nc_u64_u32 v[2:3], v4, v7, 12345
// GFX1250: v_mad_nc_u64_u32 v[2:3], v4, v7, 0x3039 ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] clamp
// GFX1250: v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfa,0xd6,0x04,0x0e,0x22,0x04]
v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9]
// GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x0e,0x22,0x04]
v_mad_nc_i64_i32 v[2:3], v4, 0, 1
// GFX1250: v_mad_nc_i64_i32 v[2:3], v4, 0, 1 ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x01,0x05,0x02]
v_mad_nc_i64_i32 v[2:3], v4, 3, s[2:3]
// GFX1250: v_mad_nc_i64_i32 v[2:3], v4, 3, s[2:3] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x07,0x09,0x00]
v_mad_nc_i64_i32 v[2:3], s4, 4, v[2:3]
// GFX1250: v_mad_nc_i64_i32 v[2:3], s4, 4, v[2:3] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x08,0x09,0x04]
v_mad_nc_i64_i32 v[2:3], v4, v7, 12345
// GFX1250: v_mad_nc_i64_i32 v[2:3], v4, v7, 0x3039 ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp
// GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfb,0xd6,0x04,0x0e,0x22,0x04]
v_cvt_pk_bf16_f32 v5, v1, v2
// GFX1250: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x02,0x00]

View File

@ -5,7 +5,40 @@ v_lshl_add_u64 v[2:3], v[4:5], v7, v[8:9] dpp8:[7,6,5,4,3,2,1,0]
// GFX125X-ERR-NEXT:{{^}}v_lshl_add_u64 v[2:3], v[4:5], v7, v[8:9] dpp8:[7,6,5,4,3,2,1,0]
// GFX125X-ERR-NEXT:{{^}} ^
v_mad_u32 v2, v4, v7, v8 dpp8:[7,6,5,4,3,2,1,0]
// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// GFX125X-ERR-NEXT:{{^}}v_mad_u32 v2, v4, v7, v8 dpp8:[7,6,5,4,3,2,1,0]
// GFX125X-ERR-NEXT:{{^}} ^
v_mad_nc_u64_u32 v[4:5], v2, v5, v[6:7] dpp8:[7,6,5,4,3,2,1,0]
// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// GFX125X-ERR-NEXT:{{^}}v_mad_nc_u64_u32 v[4:5], v2, v5, v[6:7] dpp8:[7,6,5,4,3,2,1,0]
// GFX125X-ERR-NEXT:{{^}} ^
v_mad_nc_i64_i32 v[4:5], v2, v5, v[6:7] dpp8:[7,6,5,4,3,2,1,0]
// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// GFX125X-ERR-NEXT:{{^}}v_mad_nc_i64_i32 v[4:5], v2, v5, v[6:7] dpp8:[7,6,5,4,3,2,1,0]
// GFX125X-ERR-NEXT:{{^}} ^
v_lshl_add_u64 v[2:3], v[4:5], v7, v[8:9] quad_perm:[3,2,1,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// GFX125X-ERR-NEXT:{{^}}v_lshl_add_u64 v[2:3], v[4:5], v7, v[8:9] quad_perm:[3,2,1,0]
// GFX125X-ERR-NEXT:{{^}} ^
v_mad_u32 v2, v4, v7, v8 quad_perm:[3,2,1,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// GFX1251-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: DP ALU dpp only supports row_share
// GFX125X-ERR-NEXT:{{^}}v_mad_u32 v2, v4, v7, v8 quad_perm:[3,2,1,0]
// GFX125X-ERR-NEXT:{{^}} ^
v_mad_nc_u64_u32 v[4:5], v2, v5, v[6:7] quad_perm:[3,2,1,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// GFX1251-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: DP ALU dpp only supports row_share
// GFX125X-ERR-NEXT:{{^}}v_mad_nc_u64_u32 v[4:5], v2, v5, v[6:7] quad_perm:[3,2,1,0]
// GFX125X-ERR-NEXT:{{^}} ^
v_mad_nc_i64_i32 v[4:5], v2, v5, v[6:7] quad_perm:[3,2,1,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// GFX1251-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: DP ALU dpp only supports row_share
// GFX125X-ERR-NEXT:{{^}}v_mad_nc_i64_i32 v[4:5], v2, v5, v[6:7] quad_perm:[3,2,1,0]
// GFX125X-ERR-NEXT:{{^}} ^

View File

@ -17,6 +17,57 @@
0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00
# GFX1250: v_lshl_add_u64 v[2:3], v[4:5], v7, 0x3039 ; encoding: [0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
0x02,0x00,0x35,0xd6,0x04,0x08,0x09,0x04
# GFX1250: v_mad_u32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x08,0x09,0x04]
0x02,0x00,0x35,0xd6,0x04,0x0e,0x22,0x04
# GFX1250: v_mad_u32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x0e,0x22,0x04]
0x02,0x00,0x35,0xd6,0x04,0x01,0x05,0x02
# GFX1250: v_mad_u32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x01,0x05,0x02]
0x02,0x00,0x35,0xd6,0x04,0x07,0x09,0x00
# GFX1250: v_mad_u32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x07,0x09,0x00]
0x02,0x00,0x35,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00
# GFX1250: v_mad_u32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
0x02,0x00,0xfa,0xd6,0x04,0x08,0x09,0x04
# GFX1250: v_mad_nc_u64_u32 v[2:3], s4, 4, v[2:3] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x08,0x09,0x04]
0x02,0x00,0xfa,0xd6,0x04,0x0e,0x22,0x04
# GFX1250: v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x0e,0x22,0x04]
0x02,0x00,0xfa,0xd6,0x04,0x01,0x05,0x02
# GFX1250: v_mad_nc_u64_u32 v[2:3], v4, 0, 1 ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x01,0x05,0x02]
0x02,0x00,0xfa,0xd6,0x04,0x07,0x09,0x00
# GFX1250: v_mad_nc_u64_u32 v[2:3], v4, 3, s[2:3] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x07,0x09,0x00]
0x02,0x00,0xfa,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00
# GFX1250: v_mad_nc_u64_u32 v[2:3], v4, v7, 0x3039 ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
0x02,0x80,0xfa,0xd6,0x04,0x0e,0x22,0x04
# GFX1250: v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfa,0xd6,0x04,0x0e,0x22,0x04]
0x02,0x00,0xfb,0xd6,0x04,0x08,0x09,0x04
# GFX1250: v_mad_nc_i64_i32 v[2:3], s4, 4, v[2:3] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x08,0x09,0x04]
0x02,0x00,0xfb,0xd6,0x04,0x0e,0x22,0x04
# GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x0e,0x22,0x04]
0x02,0x00,0xfb,0xd6,0x04,0x01,0x05,0x02
# GFX1250: v_mad_nc_i64_i32 v[2:3], v4, 0, 1 ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x01,0x05,0x02]
0x02,0x00,0xfb,0xd6,0x04,0x07,0x09,0x00
# GFX1250: v_mad_nc_i64_i32 v[2:3], v4, 3, s[2:3] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x07,0x09,0x00]
0x02,0x00,0xfb,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00
# GFX1250: v_mad_nc_i64_i32 v[2:3], v4, v7, 0x3039 ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
0x02,0x80,0xfb,0xd6,0x04,0x0e,0x22,0x04
# GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfb,0xd6,0x04,0x0e,0x22,0x04]
0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf
# GFX1250: v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf]