[AMDGPU] Add gfx1250 cvt_pk|sr_fp8|bf8_f32 instructions (#151595)

This commit is contained in:
Stanislav Mekhanoshin 2025-07-31 16:04:46 -07:00 committed by GitHub
parent d3a9cde7b8
commit 49d89bc9f4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 580 additions and 33 deletions

View File

@ -707,6 +707,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f16, "sV2h", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f16, "sV2h", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f16, "ihiUiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f16, "ihiUiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts")

View File

@ -652,6 +652,60 @@ void test_prefetch(generic void *fptr, global void *gptr) {
__builtin_amdgcn_global_prefetch(gptr, 8);
}
// CHECK-LABEL: @test_cvt_pk_fp8_f32_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float [[TMP0]], float [[TMP1]], i32 [[TMP2]], i1 true)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_pk_fp8_f32_e5m3(global int* out, int old, float a, float b)
{
*out = __builtin_amdgcn_cvt_pk_fp8_f32_e5m3(a, b, old, true);
}
// CHECK-LABEL: @test_cvt_sr_fp8_f32_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 3)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_sr_fp8_f32_e5m3(global int* out, int old, float a, int b)
{
*out = __builtin_amdgcn_cvt_sr_fp8_f32_e5m3(a, b, old, 3);
}
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

View File

@ -3505,6 +3505,12 @@ def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
[llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
// llvm.amdgcn.cvt.pk.fp8.f32.e5m3 int vdst, float srcA, float srcB, int old, imm word_sel
def int_amdgcn_cvt_pk_fp8_f32_e5m3 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32_e5m3">,
DefaultAttrsIntrinsic<[llvm_i32_ty],
[llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
// byte_sel selects byte to write into vdst.
def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
@ -3518,6 +3524,12 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
// llvm.amdgcn.cvt.sr.fp8.f32.e5m3 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
def int_amdgcn_cvt_sr_fp8_f32_e5m3 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32_e5m3">,
DefaultAttrsIntrinsic<[llvm_i32_ty],
[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
// llvm.amdgcn.cvt.off.fp32.i4 int srcA
def int_amdgcn_cvt_off_f32_i4: ClangBuiltin<"__builtin_amdgcn_cvt_off_f32_i4">,
DefaultAttrsIntrinsic<[llvm_float_ty],

View File

@ -4633,8 +4633,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_pk_f32_fp8:
case Intrinsic::amdgcn_cvt_pk_f32_bf8:
case Intrinsic::amdgcn_cvt_pk_fp8_f32:
case Intrinsic::amdgcn_cvt_pk_fp8_f32_e5m3:
case Intrinsic::amdgcn_cvt_pk_bf8_f32:
case Intrinsic::amdgcn_cvt_sr_fp8_f32:
case Intrinsic::amdgcn_cvt_sr_fp8_f32_e5m3:
case Intrinsic::amdgcn_cvt_sr_bf8_f32:
case Intrinsic::amdgcn_cvt_sr_bf16_f32:
case Intrinsic::amdgcn_cvt_sr_f16_f32:

View File

@ -9366,6 +9366,10 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
}
}
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyClamp);
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
Inst.addOperand(Inst.getOperand(0));
@ -9373,10 +9377,6 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
AMDGPUOperand::ImmTyByteSel);
}
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyClamp);
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyOModSI);
@ -9430,6 +9430,8 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx1250_e64_dpp_gfx1250 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx1250_e64_dpp8_gfx1250 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 ||
@ -10038,9 +10040,12 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyClamp);
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
if (VdstInIdx == static_cast<int>(Inst.getNumOperands()))
Inst.addOperand(Inst.getOperand(0));
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyByteSel);
}
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);

View File

@ -625,8 +625,9 @@ def shl_0_to_4 : PatFrag<
}];
}
def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
defvar Tail = (ins VGPR_32:$vdst_in, op_sel0:$op_sel);
class VOP3_CVT_PK_F8_F32_Profile<bit _HasClamp = 0> : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
defvar Tail = !con(!if(_HasClamp, (ins Clamp:$clamp), (ins)),
(ins VGPR_32:$vdst_in, op_sel0:$op_sel));
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
0, HasModifiers, HasSrc2Mods,
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
@ -636,12 +637,13 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
Src2ModVOP3DPP, false>.ret,
Tail);
let HasClamp = 0;
let HasClamp = _HasClamp;
let HasExtVOP3DPP = 1;
}
def VOP3_CVT_PK_F8_F32_Profile_fake16 : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP3_OPSEL> {
defvar Tail = (ins VGPR_32:$vdst_in, op_sel0:$op_sel);
class VOP3_CVT_PK_F8_F32_Profile_fake16<bit _HasClamp = 0> : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP3_OPSEL> {
defvar Tail = !con(!if(_HasClamp, (ins Clamp:$clamp), (ins)),
(ins VGPR_32:$vdst_in, op_sel0:$op_sel));
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
0, HasModifiers, HasSrc2Mods,
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
@ -651,14 +653,15 @@ def VOP3_CVT_PK_F8_F32_Profile_fake16 : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
Src2ModVOP3DPP, false>.ret,
Tail);
let HasClamp = 0;
let HasClamp = _HasClamp;
let HasExtVOP3DPP = 1;
}
// This t16 profile with vdst_in operand is for backward compatibility and is used
// for user controlled packing
def VOP3_CVT_PK_F8_F32_Profile_t16 : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_OPSEL> {
defvar Tail = (ins VGPR_16:$vdst_in, op_sel0:$op_sel);
class VOP3_CVT_PK_F8_F32_Profile_t16<bit _HasClamp = 0> : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_OPSEL> {
defvar Tail = !con(!if(_HasClamp, (ins Clamp:$clamp), (ins)),
(ins VGPR_16:$vdst_in, op_sel0:$op_sel));
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
0, HasModifiers, HasSrc2Mods,
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
@ -668,7 +671,7 @@ def VOP3_CVT_PK_F8_F32_Profile_t16 : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_O
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
Src2ModVOP3DPP, false>.ret,
Tail);
let HasClamp = 0;
let HasClamp = _HasClamp;
let HasExtVOP3DPP = 1;
}
@ -702,10 +705,10 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
HasModifiers, DstVT>.ret);
}
class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT, bit _HasClamp = 0> :
VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> {
let HasFP8DstByteSel = 1;
let HasClamp = 0;
let HasClamp = _HasClamp;
}
def IsPow2Plus1: PatLeaf<(i32 imm), [{
@ -780,15 +783,23 @@ defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", V_LSHL_ADD_U64_PROF>;
let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
SchedRW = [WriteFloatCvt] in {
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile,
VOP3_CVT_PK_F8_F32_Profile_t16,
VOP3_CVT_PK_F8_F32_Profile_fake16>;
defm V_CVT_PK_BF8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile,
VOP3_CVT_PK_F8_F32_Profile_t16,
VOP3_CVT_PK_F8_F32_Profile_fake16>;
let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile<>,
VOP3_CVT_PK_F8_F32_Profile_t16<>,
VOP3_CVT_PK_F8_F32_Profile_fake16<>>;
let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in
defm V_CVT_PK_FP8_F32_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32_gfx1250", VOP3_CVT_PK_F8_F32_Profile<true>,
VOP3_CVT_PK_F8_F32_Profile_t16<true>,
VOP3_CVT_PK_F8_F32_Profile_fake16<true>>;
defm V_CVT_PK_BF8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile<>,
VOP3_CVT_PK_F8_F32_Profile_t16<>,
VOP3_CVT_PK_F8_F32_Profile_fake16<>>;
let SubtargetPredicate = isGFX12Plus in {
defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in
defm V_CVT_SR_FP8_F32_gfx1250 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx1250", VOP3_CVT_SR_F8_ByteSel_Profile<f32, true>>;
defm V_CVT_SR_BF8_F32_gfx12 : VOP3Inst<"v_cvt_sr_bf8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
}
}
@ -807,6 +818,11 @@ class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : G
(inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0)
>;
class Cvt_PK_F8_F32_E5M3_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst, int Clamp> : GCNPat<
(i32 (node f32:$src0, f32:$src1, i32:$old, index)),
(inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, Clamp, $old, 0)
>;
multiclass Cvt_PK_F8_F32_t16_Pat<SDPatternOperator node, VOP3_Pseudo inst> {
def : GCNPat<
(i32 (node f32:$src0, f32:$src1, i32:$old, -1)),
@ -822,6 +838,21 @@ def : GCNPat<
>;
}
multiclass Cvt_PK_F8_F32_E5M3_t16_Pat<SDPatternOperator node, VOP3_Pseudo inst, int Clamp> {
def : GCNPat<
(i32 (node f32:$src0, f32:$src1, i32:$old, -1)),
(REG_SEQUENCE VGPR_32,
(i16 (EXTRACT_SUBREG $old, lo16)), lo16,
(i16 (inst SRCMODS.DST_OP_SEL, $src0, 0, $src1, Clamp, (i16 (EXTRACT_SUBREG $old, hi16)), 0)), hi16)
>;
def : GCNPat<
(i32 (node f32:$src0, f32:$src1, i32:$old, 0)),
(REG_SEQUENCE VGPR_32,
(i16 (inst 0, $src0, 0, $src1, Clamp, (i16 (EXTRACT_SUBREG $old, lo16)), 0)), lo16,
(i16 (EXTRACT_SUBREG $old, hi16)), hi16)
>;
}
class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat<
(i32 (node f32:$src0, i32:$src1, i32:$old, index)),
(inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
@ -834,21 +865,37 @@ class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType
(inst $src0_modifiers, $src0, $src1_modifiers, $src1, $old, (as_i32timm $byte_sel))
>;
class Cvt_SR_F8_ByteSel_E5M3_Pat<SDPatternOperator node, VOP3_Pseudo inst,
ValueType SrcVT, int Clamp> : GCNPat<
(i32 (node (VOP3Mods SrcVT:$src0, i32:$src0_modifiers), (VOP3Mods i32:$src1, i32:$src1_modifiers),
i32:$old, timm:$byte_sel)),
(inst $src0_modifiers, $src0, $src1_modifiers, $src1, Clamp, $old, (as_i32timm $byte_sel))
>;
let OtherPredicates = [HasFP8ConversionInsts] in {
foreach Index = [0, -1] in {
let True16Predicate = NotHasTrue16BitInsts in {
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
}
let True16Predicate = UseFakeTrue16Insts in {
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_fake16_e64>;
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_fake16_e64>;
let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in {
def : Cvt_PK_F8_F32_E5M3_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_gfx1250_fake16_e64, DSTCLAMP.NONE>;
def : Cvt_PK_F8_F32_E5M3_Pat<int_amdgcn_cvt_pk_fp8_f32_e5m3, Index, V_CVT_PK_FP8_F32_gfx1250_fake16_e64, DSTCLAMP.ENABLE>;
}
}
}
let True16Predicate = UseRealTrue16Insts in {
defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_fp8_f32, V_CVT_PK_FP8_F32_t16_e64>;
defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_bf8_f32, V_CVT_PK_BF8_F32_t16_e64>;
let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in {
defm : Cvt_PK_F8_F32_E5M3_t16_Pat<int_amdgcn_cvt_pk_fp8_f32, V_CVT_PK_FP8_F32_gfx1250_t16_e64, DSTCLAMP.NONE>;
defm : Cvt_PK_F8_F32_E5M3_t16_Pat<int_amdgcn_cvt_pk_fp8_f32_e5m3, V_CVT_PK_FP8_F32_gfx1250_t16_e64, DSTCLAMP.ENABLE>;
}
}
let SubtargetPredicate = isGFX940Plus in {
@ -859,7 +906,12 @@ let SubtargetPredicate = isGFX940Plus in {
}
let SubtargetPredicate = isGFX12Plus in {
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>;
let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>;
let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in {
def : Cvt_SR_F8_ByteSel_E5M3_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx1250_e64, f32, DSTCLAMP.NONE>;
def : Cvt_SR_F8_ByteSel_E5M3_Pat<int_amdgcn_cvt_sr_fp8_f32_e5m3, V_CVT_SR_FP8_F32_gfx1250_e64, f32, DSTCLAMP.ENABLE>;
}
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f32, V_CVT_SR_BF8_F32_gfx12_e64, f32>;
}
}
@ -1892,11 +1944,6 @@ defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>;
defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>;
defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>;
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
//===----------------------------------------------------------------------===//
// GFX11, GFX12
//===----------------------------------------------------------------------===//
@ -2057,6 +2104,13 @@ defm V_AND_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x36
defm V_OR_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x363, "v_or_b16">;
defm V_XOR_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x364, "v_xor_b16">;
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_not_gfx1250<0x369, "v_cvt_pk_fp8_f32">;
defm V_CVT_PK_FP8_F32_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x369, "v_cvt_pk_fp8_f32">;
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx11_gfx12_not_gfx1250<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32">;
defm V_CVT_SR_FP8_F32_gfx1250 : VOP3Only_Realtriple_with_name_gfx1250<0x36b, "V_CVT_SR_FP8_F32_gfx1250", "v_cvt_sr_fp8_f32">;
defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx11_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
let AssemblerPredicate = isGFX11Plus in {
def : AMDGPUMnemonicAlias<"v_add3_nc_u32", "v_add3_u32">;
def : AMDGPUMnemonicAlias<"v_xor_add_u32", "v_xad_u32">;

View File

@ -2010,6 +2010,23 @@ multiclass VOP3_BITOP3_Real_Base<GFXGen Gen, bits<10> op, string asmName> {
}
}
multiclass VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<bits<10> op, string asmName, string opName = NAME,
string pseudo_mnemonic = "", bit isSingle = 0> :
VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
VOP3_Realtriple_with_name<GFX12Not12_50Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_not_gfx1250<bits<10> op, string asmName,
string opName = NAME, string pseudo_mnemonic = ""> {
defm _t16 : VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<op, asmName, opName#"_t16", pseudo_mnemonic, 1>;
defm _fake16 : VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<op, asmName, opName#"_fake16", pseudo_mnemonic, 1>;
}
multiclass VOP3_Realtriple_with_name_gfx11_gfx12_not_gfx1250<bits<10> op, string opName,
string asmName, string pseudo_mnemonic = "",
bit isSingle = 0> :
VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
VOP3_Realtriple_with_name<GFX12Not12_50Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
//===----------------------------------------------------------------------===//
// VOP3 GFX11
//===----------------------------------------------------------------------===//

View File

@ -1,10 +1,188 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
declare i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float, float, i32, i1)
declare i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float, i32, i32, i32)
declare float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32, i32)
define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) {
; GFX1250-TRUE16-LABEL: test_cvt_pk_fp8_f32_word0:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.l, v0, v1 clamp
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: test_cvt_pk_fp8_f32_word0:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 clamp
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v2
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-GISEL-LABEL: test_cvt_pk_fp8_f32_word0:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 clamp
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v2
; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float %x, float %y, i32 %old, i1 false)
ret i32 %ret
}
define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
; GFX1250-TRUE16-LABEL: test_cvt_pk_fp8_f32_word1:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.h, v0, v1 op_sel:[0,0,1] clamp
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: test_cvt_pk_fp8_f32_word1:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] clamp
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v2
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-GISEL-LABEL: test_cvt_pk_fp8_f32_word1:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] clamp
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v2
; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float %x, float %y, i32 %old, i1 true)
ret i32 %ret
}
define amdgpu_cs void @test_cvt_pk_fp8_f32_word1_dpp(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) {
; GFX1250-TRUE16-LABEL: test_cvt_pk_fp8_f32_word1_dpp:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.h, v0, v1 op_sel:[0,0,1] clamp
; GFX1250-TRUE16-NEXT: global_store_b32 v[4:5], v2, off
; GFX1250-TRUE16-NEXT: s_endpgm
;
; GFX1250-FAKE16-LABEL: test_cvt_pk_fp8_f32_word1_dpp:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] clamp
; GFX1250-FAKE16-NEXT: global_store_b32 v[4:5], v2, off
; GFX1250-FAKE16-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: test_cvt_pk_fp8_f32_word1_dpp:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] clamp
; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v2, off
; GFX1250-GISEL-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%tmp1 = bitcast i32 %tmp0 to float
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float %tmp1, float %y, i32 %old, i1 true)
store i32 %ret, ptr addrspace(1) %out
ret void
}
define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) {
; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte0:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 clamp
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v0, v2
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float %x, i32 %r, i32 %old, i32 0)
ret i32 %ret
}
define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte1:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:1 clamp
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v0, v2
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float %x, i32 %r, i32 %old, i32 1)
ret i32 %ret
}
define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) {
; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte2:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:2 clamp
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v0, v2
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float %x, i32 %r, i32 %old, i32 2)
ret i32 %ret
}
define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) {
; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte3:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:3 clamp
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v0, v2
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float %x, i32 %r, i32 %old, i32 3)
ret i32 %ret
}
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1_dpp(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX1250-TRUE16-LABEL: test_cvt_sr_fp8_f32_byte1_dpp:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-TRUE16-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 clamp quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1250-TRUE16-NEXT: global_store_b32 v[4:5], v2, off
; GFX1250-TRUE16-NEXT: s_endpgm
;
; GFX1250-FAKE16-LABEL: test_cvt_sr_fp8_f32_byte1_dpp:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-FAKE16-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 clamp quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1250-FAKE16-NEXT: global_store_b32 v[4:5], v2, off
; GFX1250-FAKE16-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: test_cvt_sr_fp8_f32_byte1_dpp:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
; GFX1250-GISEL-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 clamp quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v2, off
; GFX1250-GISEL-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%tmp1 = bitcast i32 %tmp0 to float
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float %tmp1, i32 %r, i32 %old, i32 1)
store i32 %ret, ptr addrspace(1) %out
ret void
}
define float @test_cvt_f32_fp8_e5m3_byte0(i32 %a) {
; GFX1250-LABEL: test_cvt_f32_fp8_e5m3_byte0:
; GFX1250: ; %bb.0:

View File

@ -637,3 +637,51 @@ v_cvt_sr_fp8_f16 v1, v2, v3 op_sel:[1] byte_sel:2
v_cvt_sr_fp8_f16 v1, v2, v3 op_sel:[1] byte_sel:3
// GFX1250: v_cvt_sr_fp8_f16 v1, v2, v3 op_sel:[1,0,0] byte_sel:3 ; encoding: [0x01,0x68,0x74,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_fp8_f32 v1, v2, v3
// GFX1250: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1]
// GFX1250: v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_fp8_f32 v1, -v2, |v3|
// GFX1250: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
v_cvt_pk_fp8_f32 v1, s2, 3
// GFX1250: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
v_cvt_pk_fp8_f32 v1, v2, v3 clamp
// GFX1250: v_cvt_pk_fp8_f32 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x69,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] clamp
// GFX1250: v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] clamp ; encoding: [0x01,0xc0,0x69,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_bf8_f32 v1, v2, v3
// GFX1250: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_bf8_f32 v1, -v2, |v3|
// GFX1250: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
v_cvt_pk_bf8_f32 v1, s2, 3
// GFX1250: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
v_cvt_sr_fp8_f32 v1, v2, v3
// GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]
v_cvt_sr_fp8_f32 v10, s2, v5
// GFX1250: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00]
v_cvt_sr_fp8_f32 v5, -|v255|, v4
// GFX1250: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20]
v_cvt_sr_fp8_f32 v1, v2, v3 clamp
// GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x6b,0xd7,0x02,0x07,0x02,0x00]
v_cvt_sr_bf8_f32 v1, v2, v3
// GFX1250: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
v_cvt_sr_bf8_f32 v10, s2, v5
// GFX1250: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00]
v_cvt_sr_bf8_f32 v5, -|v255|, v4
// GFX1250: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]

View File

@ -637,3 +637,51 @@ v_cvt_sr_fp8_f16 v1, v2.h, v3 byte_sel:2
v_cvt_sr_fp8_f16 v1, v2.h, v3 byte_sel:3
// GFX1250: v_cvt_sr_fp8_f16 v1, v2.h, v3 op_sel:[1,0,0] byte_sel:3 ; encoding: [0x01,0x68,0x74,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_fp8_f32 v1.l, v2, v3
// GFX1250: v_cvt_pk_fp8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_fp8_f32 v1.h, v2, v3
// GFX1250: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_fp8_f32 v1, -v2, |v3|
// GFX1250: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
v_cvt_pk_fp8_f32 v1, s2, 3
// GFX1250: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
v_cvt_pk_fp8_f32 v1.l, v2, v3 clamp
// GFX1250: v_cvt_pk_fp8_f32 v1.l, v2, v3 clamp ; encoding: [0x01,0x80,0x69,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_fp8_f32 v1.h, v2, v3 clamp
// GFX1250: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] clamp ; encoding: [0x01,0xc0,0x69,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_bf8_f32 v1, v2, v3
// GFX1250: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_bf8_f32 v1, -v2, |v3|
// GFX1250: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
v_cvt_pk_bf8_f32 v1, s2, 3
// GFX1250: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
v_cvt_sr_fp8_f32 v1, v2, v3
// GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]
v_cvt_sr_fp8_f32 v10, s2, v5
// GFX1250: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00]
v_cvt_sr_fp8_f32 v5, -|v255|, v4
// GFX1250: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20]
v_cvt_sr_fp8_f32 v1, v2, v3 clamp
// GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x6b,0xd7,0x02,0x07,0x02,0x00]
v_cvt_sr_bf8_f32 v1, v2, v3
// GFX1250: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
v_cvt_sr_bf8_f32 v10, s2, v5
// GFX1250: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00]
v_cvt_sr_bf8_f32 v5, -|v255|, v4
// GFX1250: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]

View File

@ -250,6 +250,18 @@ v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:
// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x6d,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x82,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
// GFX12-ERR: :[[@LINE-2]]:40: error: invalid operand for instruction
v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| op_sel:[0,0,1] clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| op_sel:[0,0,1] clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0xc2,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
// GFX12-ERR: :[[@LINE-2]]:61: error: not a valid operand.
v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
// GFX1250: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x80,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
// GFX12-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX1250: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

View File

@ -250,6 +250,18 @@ v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:
// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x6d,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x82,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
// GFX12-ERR: :[[@LINE-2]]:42: error: invalid operand for instruction
v_cvt_pk_fp8_f32_e64_dpp v1.h, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v1.h, -v2, |v3| op_sel:[0,0,1] clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0xc2,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
// GFX12-ERR: :[[@LINE-2]]:42: error: invalid operand for instruction
v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
// GFX1250: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x80,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
// GFX12-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX1250: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

View File

@ -170,6 +170,18 @@ v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x6d,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1]
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
// GFX12-ERR: :[[@LINE-2]]:37: error: invalid operand for instruction
v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 op_sel:[0,0,1] clamp dpp8:[7,6,5,4,2,3,0,1]
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 op_sel:[0,0,1] clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0xc0,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
// GFX12-ERR: :[[@LINE-2]]:58: error: not a valid operand.
v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1]
// GFX1250: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
// GFX12-ERR: :[[@LINE-2]]:37: error: invalid operand for instruction
v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

View File

@ -170,6 +170,18 @@ v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x6d,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1]
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
// GFX12-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction
v_cvt_pk_fp8_f32_e64_dpp v5.h, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1]
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v5.h, v1, v2 op_sel:[0,0,1] clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0xc0,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
// GFX12-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction
v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1]
// GFX1250: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
// GFX12-ERR: :[[@LINE-2]]:37: error: invalid operand for instruction
v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

View File

@ -679,3 +679,60 @@
0x01,0x09,0x74,0xd7,0x02,0x07,0x02,0x00
# GFX1250-REAL16: v_cvt_sr_fp8_f16 v1, |v2.h|, v3 op_sel:[1,0,0] ; encoding: [0x01,0x09,0x74,0xd7,0x02,0x07,0x02,0x00]
# GFX1250-FAKE16: v_cvt_sr_fp8_f16 v1, |v2|, v3 op_sel:[1,0,0] ; encoding: [0x01,0x09,0x74,0xd7,0x02,0x07,0x02,0x00]
0x01,0x80,0x6b,0xd7,0x02,0x07,0x02,0x00
# GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x6b,0xd7,0x02,0x07,0x02,0x00]
0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00
# GFX1250-REAL16: v_cvt_pk_fp8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
# GFX1250-FAKE16: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00
# GFX1250-REAL16: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
# GFX1250-FAKE16: v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20
# GFX1250-REAL16: v_cvt_pk_fp8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
# GFX1250-FAKE16: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00
# GFX1250-REAL16: v_cvt_pk_fp8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
# GFX1250-FAKE16: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
0x01,0x80,0x69,0xd7,0x02,0x07,0x02,0x00
# GFX1250-REAL16: v_cvt_pk_fp8_f32 v1.l, v2, v3 clamp ; encoding: [0x01,0x80,0x69,0xd7,0x02,0x07,0x02,0x00]
# GFX1250-FAKE16: v_cvt_pk_fp8_f32 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x69,0xd7,0x02,0x07,0x02,0x00]
0x01,0xc0,0x69,0xd7,0x02,0x07,0x02,0x00
# GFX1250-REAL16: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] clamp ; encoding: [0x01,0xc0,0x69,0xd7,0x02,0x07,0x02,0x00]
# GFX1250-FAKE16: v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] clamp ; encoding: [0x01,0xc0,0x69,0xd7,0x02,0x07,0x02,0x00]
0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00
# GFX1250-REAL16: v_cvt_pk_bf8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
# GFX1250-FAKE16: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20
# GFX1250-REAL16: v_cvt_pk_bf8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
# GFX1250-FAKE16: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00
# GFX1250-REAL16: v_cvt_pk_bf8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
# GFX1250-FAKE16: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00
# GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]
0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00
# GFX1250: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00]
0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20
# GFX1250: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20]
0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00
# GFX1250: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00
# GFX1250: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00]
0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20
# GFX1250: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]

View File

@ -210,6 +210,17 @@
0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x01,0x82,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed
# GFX1250-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x82,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
# GFX1250-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x82,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
0x01,0xc2,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed
# GFX1250-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.h, -v2, |v3| op_sel:[0,0,1] clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0xc2,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
# GFX1250-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| op_sel:[0,0,1] clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0xc2,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
0x01,0x80,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed
# GFX1250: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x80,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
0xff,0x83,0x6e,0xd7,0xfa,0xfe,0xf7,0x7b,0xff,0x6f,0x05,0x30
# GFX1250: v_cvt_sr_pk_bf16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x83,0x6e,0xd7,0xfa,0xfe,0xf7,0x7b,0xff,0x6f,0x05,0x30]

View File

@ -146,6 +146,17 @@
0x05,0x00,0x6d,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05
# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6d,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
0x05,0x80,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21
# GFX1250-REAL16: v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
# GFX1250-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
0x05,0xc0,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21
# GFX1250-REAL16: v_cvt_pk_fp8_f32_e64_dpp v5.h, v1, v2 op_sel:[0,0,1] clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0xc0,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
# GFX1250-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 op_sel:[0,0,1] clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0xc0,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
0x05,0x80,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21
# GFX1250: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
0xff,0x83,0x6e,0xd7,0xe9,0xfe,0xf7,0x7b,0xff,0x00,0x00,0x00
# GFX1250: v_cvt_sr_pk_bf16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x83,0x6e,0xd7,0xe9,0xfe,0xf7,0x7b,0xff,0x00,0x00,0x00]