[AMDGPU] Add gfx1250 cvt_pk|sr_fp8|bf8_f32 instructions (#151595)
This commit is contained in:
parent
d3a9cde7b8
commit
49d89bc9f4
@ -707,6 +707,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f16, "sV2h", "nc", "gfx1250-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f16, "sV2h", "nc", "gfx1250-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f16, "ihiUiIi", "nc", "gfx1250-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f16, "ihiUiIi", "nc", "gfx1250-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts")
|
||||
|
||||
|
@ -652,6 +652,60 @@ void test_prefetch(generic void *fptr, global void *gptr) {
|
||||
__builtin_amdgcn_global_prefetch(gptr, 8);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_cvt_pk_fp8_f32_e5m3(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
|
||||
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
|
||||
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: store float [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float [[TMP0]], float [[TMP1]], i32 [[TMP2]], i1 true)
|
||||
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
void test_cvt_pk_fp8_f32_e5m3(global int* out, int old, float a, float b)
|
||||
{
|
||||
*out = __builtin_amdgcn_cvt_pk_fp8_f32_e5m3(a, b, old, true);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_cvt_sr_fp8_f32_e5m3(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
|
||||
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
|
||||
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 3)
|
||||
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
void test_cvt_sr_fp8_f32_e5m3(global int* out, int old, float a, int b)
|
||||
{
|
||||
*out = __builtin_amdgcn_cvt_sr_fp8_f32_e5m3(a, b, old, 3);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
|
||||
|
@ -3505,6 +3505,12 @@ def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
|
||||
[llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
|
||||
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
|
||||
|
||||
// llvm.amdgcn.cvt.pk.fp8.f32.e5m3 int vdst, float srcA, float srcB, int old, imm word_sel
|
||||
def int_amdgcn_cvt_pk_fp8_f32_e5m3 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32_e5m3">,
|
||||
DefaultAttrsIntrinsic<[llvm_i32_ty],
|
||||
[llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
|
||||
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
|
||||
|
||||
// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
|
||||
// byte_sel selects byte to write into vdst.
|
||||
def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
|
||||
@ -3518,6 +3524,12 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
|
||||
[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
|
||||
|
||||
// llvm.amdgcn.cvt.sr.fp8.f32.e5m3 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
|
||||
def int_amdgcn_cvt_sr_fp8_f32_e5m3 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32_e5m3">,
|
||||
DefaultAttrsIntrinsic<[llvm_i32_ty],
|
||||
[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
|
||||
|
||||
// llvm.amdgcn.cvt.off.fp32.i4 int srcA
|
||||
def int_amdgcn_cvt_off_f32_i4: ClangBuiltin<"__builtin_amdgcn_cvt_off_f32_i4">,
|
||||
DefaultAttrsIntrinsic<[llvm_float_ty],
|
||||
|
@ -4633,8 +4633,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||
case Intrinsic::amdgcn_cvt_pk_f32_fp8:
|
||||
case Intrinsic::amdgcn_cvt_pk_f32_bf8:
|
||||
case Intrinsic::amdgcn_cvt_pk_fp8_f32:
|
||||
case Intrinsic::amdgcn_cvt_pk_fp8_f32_e5m3:
|
||||
case Intrinsic::amdgcn_cvt_pk_bf8_f32:
|
||||
case Intrinsic::amdgcn_cvt_sr_fp8_f32:
|
||||
case Intrinsic::amdgcn_cvt_sr_fp8_f32_e5m3:
|
||||
case Intrinsic::amdgcn_cvt_sr_bf8_f32:
|
||||
case Intrinsic::amdgcn_cvt_sr_bf16_f32:
|
||||
case Intrinsic::amdgcn_cvt_sr_f16_f32:
|
||||
|
@ -9366,6 +9366,10 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
|
||||
}
|
||||
}
|
||||
|
||||
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
|
||||
addOptionalImmOperand(Inst, Operands, OptionalIdx,
|
||||
AMDGPUOperand::ImmTyClamp);
|
||||
|
||||
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
|
||||
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
|
||||
Inst.addOperand(Inst.getOperand(0));
|
||||
@ -9373,10 +9377,6 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
|
||||
AMDGPUOperand::ImmTyByteSel);
|
||||
}
|
||||
|
||||
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
|
||||
addOptionalImmOperand(Inst, Operands, OptionalIdx,
|
||||
AMDGPUOperand::ImmTyClamp);
|
||||
|
||||
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
|
||||
addOptionalImmOperand(Inst, Operands, OptionalIdx,
|
||||
AMDGPUOperand::ImmTyOModSI);
|
||||
@ -9430,6 +9430,8 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
|
||||
Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx12 ||
|
||||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
|
||||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
|
||||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx1250_e64_dpp_gfx1250 ||
|
||||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx1250_e64_dpp8_gfx1250 ||
|
||||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
|
||||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 ||
|
||||
Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 ||
|
||||
@ -10038,9 +10040,12 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
|
||||
addOptionalImmOperand(Inst, Operands, OptionalIdx,
|
||||
AMDGPUOperand::ImmTyClamp);
|
||||
|
||||
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
|
||||
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
|
||||
if (VdstInIdx == static_cast<int>(Inst.getNumOperands()))
|
||||
Inst.addOperand(Inst.getOperand(0));
|
||||
addOptionalImmOperand(Inst, Operands, OptionalIdx,
|
||||
AMDGPUOperand::ImmTyByteSel);
|
||||
}
|
||||
|
||||
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
|
||||
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
|
||||
|
@ -625,8 +625,9 @@ def shl_0_to_4 : PatFrag<
|
||||
}];
|
||||
}
|
||||
|
||||
def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
|
||||
defvar Tail = (ins VGPR_32:$vdst_in, op_sel0:$op_sel);
|
||||
class VOP3_CVT_PK_F8_F32_Profile<bit _HasClamp = 0> : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
|
||||
defvar Tail = !con(!if(_HasClamp, (ins Clamp:$clamp), (ins)),
|
||||
(ins VGPR_32:$vdst_in, op_sel0:$op_sel));
|
||||
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
|
||||
0, HasModifiers, HasSrc2Mods,
|
||||
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
|
||||
@ -636,12 +637,13 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
|
||||
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
|
||||
Src2ModVOP3DPP, false>.ret,
|
||||
Tail);
|
||||
let HasClamp = 0;
|
||||
let HasClamp = _HasClamp;
|
||||
let HasExtVOP3DPP = 1;
|
||||
}
|
||||
|
||||
def VOP3_CVT_PK_F8_F32_Profile_fake16 : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP3_OPSEL> {
|
||||
defvar Tail = (ins VGPR_32:$vdst_in, op_sel0:$op_sel);
|
||||
class VOP3_CVT_PK_F8_F32_Profile_fake16<bit _HasClamp = 0> : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP3_OPSEL> {
|
||||
defvar Tail = !con(!if(_HasClamp, (ins Clamp:$clamp), (ins)),
|
||||
(ins VGPR_32:$vdst_in, op_sel0:$op_sel));
|
||||
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
|
||||
0, HasModifiers, HasSrc2Mods,
|
||||
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
|
||||
@ -651,14 +653,15 @@ def VOP3_CVT_PK_F8_F32_Profile_fake16 : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP
|
||||
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
|
||||
Src2ModVOP3DPP, false>.ret,
|
||||
Tail);
|
||||
let HasClamp = 0;
|
||||
let HasClamp = _HasClamp;
|
||||
let HasExtVOP3DPP = 1;
|
||||
}
|
||||
|
||||
// This t16 profile with vdst_in operand is for backward compatibility and is used
|
||||
// for user controlled packing
|
||||
def VOP3_CVT_PK_F8_F32_Profile_t16 : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_OPSEL> {
|
||||
defvar Tail = (ins VGPR_16:$vdst_in, op_sel0:$op_sel);
|
||||
class VOP3_CVT_PK_F8_F32_Profile_t16<bit _HasClamp = 0> : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_OPSEL> {
|
||||
defvar Tail = !con(!if(_HasClamp, (ins Clamp:$clamp), (ins)),
|
||||
(ins VGPR_16:$vdst_in, op_sel0:$op_sel));
|
||||
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
|
||||
0, HasModifiers, HasSrc2Mods,
|
||||
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
|
||||
@ -668,7 +671,7 @@ def VOP3_CVT_PK_F8_F32_Profile_t16 : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_O
|
||||
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
|
||||
Src2ModVOP3DPP, false>.ret,
|
||||
Tail);
|
||||
let HasClamp = 0;
|
||||
let HasClamp = _HasClamp;
|
||||
let HasExtVOP3DPP = 1;
|
||||
}
|
||||
|
||||
@ -702,10 +705,10 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
|
||||
HasModifiers, DstVT>.ret);
|
||||
}
|
||||
|
||||
class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
|
||||
class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT, bit _HasClamp = 0> :
|
||||
VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> {
|
||||
let HasFP8DstByteSel = 1;
|
||||
let HasClamp = 0;
|
||||
let HasClamp = _HasClamp;
|
||||
}
|
||||
|
||||
def IsPow2Plus1: PatLeaf<(i32 imm), [{
|
||||
@ -780,15 +783,23 @@ defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", V_LSHL_ADD_U64_PROF>;
|
||||
let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
|
||||
SchedRW = [WriteFloatCvt] in {
|
||||
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
|
||||
defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile,
|
||||
VOP3_CVT_PK_F8_F32_Profile_t16,
|
||||
VOP3_CVT_PK_F8_F32_Profile_fake16>;
|
||||
defm V_CVT_PK_BF8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile,
|
||||
VOP3_CVT_PK_F8_F32_Profile_t16,
|
||||
VOP3_CVT_PK_F8_F32_Profile_fake16>;
|
||||
let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
|
||||
defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile<>,
|
||||
VOP3_CVT_PK_F8_F32_Profile_t16<>,
|
||||
VOP3_CVT_PK_F8_F32_Profile_fake16<>>;
|
||||
let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in
|
||||
defm V_CVT_PK_FP8_F32_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32_gfx1250", VOP3_CVT_PK_F8_F32_Profile<true>,
|
||||
VOP3_CVT_PK_F8_F32_Profile_t16<true>,
|
||||
VOP3_CVT_PK_F8_F32_Profile_fake16<true>>;
|
||||
defm V_CVT_PK_BF8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile<>,
|
||||
VOP3_CVT_PK_F8_F32_Profile_t16<>,
|
||||
VOP3_CVT_PK_F8_F32_Profile_fake16<>>;
|
||||
|
||||
let SubtargetPredicate = isGFX12Plus in {
|
||||
defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
|
||||
let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
|
||||
defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
|
||||
let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in
|
||||
defm V_CVT_SR_FP8_F32_gfx1250 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx1250", VOP3_CVT_SR_F8_ByteSel_Profile<f32, true>>;
|
||||
defm V_CVT_SR_BF8_F32_gfx12 : VOP3Inst<"v_cvt_sr_bf8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
|
||||
}
|
||||
}
|
||||
@ -807,6 +818,11 @@ class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : G
|
||||
(inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0)
|
||||
>;
|
||||
|
||||
class Cvt_PK_F8_F32_E5M3_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst, int Clamp> : GCNPat<
|
||||
(i32 (node f32:$src0, f32:$src1, i32:$old, index)),
|
||||
(inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, Clamp, $old, 0)
|
||||
>;
|
||||
|
||||
multiclass Cvt_PK_F8_F32_t16_Pat<SDPatternOperator node, VOP3_Pseudo inst> {
|
||||
def : GCNPat<
|
||||
(i32 (node f32:$src0, f32:$src1, i32:$old, -1)),
|
||||
@ -822,6 +838,21 @@ def : GCNPat<
|
||||
>;
|
||||
}
|
||||
|
||||
multiclass Cvt_PK_F8_F32_E5M3_t16_Pat<SDPatternOperator node, VOP3_Pseudo inst, int Clamp> {
|
||||
def : GCNPat<
|
||||
(i32 (node f32:$src0, f32:$src1, i32:$old, -1)),
|
||||
(REG_SEQUENCE VGPR_32,
|
||||
(i16 (EXTRACT_SUBREG $old, lo16)), lo16,
|
||||
(i16 (inst SRCMODS.DST_OP_SEL, $src0, 0, $src1, Clamp, (i16 (EXTRACT_SUBREG $old, hi16)), 0)), hi16)
|
||||
>;
|
||||
def : GCNPat<
|
||||
(i32 (node f32:$src0, f32:$src1, i32:$old, 0)),
|
||||
(REG_SEQUENCE VGPR_32,
|
||||
(i16 (inst 0, $src0, 0, $src1, Clamp, (i16 (EXTRACT_SUBREG $old, lo16)), 0)), lo16,
|
||||
(i16 (EXTRACT_SUBREG $old, hi16)), hi16)
|
||||
>;
|
||||
}
|
||||
|
||||
class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat<
|
||||
(i32 (node f32:$src0, i32:$src1, i32:$old, index)),
|
||||
(inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
|
||||
@ -834,21 +865,37 @@ class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType
|
||||
(inst $src0_modifiers, $src0, $src1_modifiers, $src1, $old, (as_i32timm $byte_sel))
|
||||
>;
|
||||
|
||||
class Cvt_SR_F8_ByteSel_E5M3_Pat<SDPatternOperator node, VOP3_Pseudo inst,
|
||||
ValueType SrcVT, int Clamp> : GCNPat<
|
||||
(i32 (node (VOP3Mods SrcVT:$src0, i32:$src0_modifiers), (VOP3Mods i32:$src1, i32:$src1_modifiers),
|
||||
i32:$old, timm:$byte_sel)),
|
||||
(inst $src0_modifiers, $src0, $src1_modifiers, $src1, Clamp, $old, (as_i32timm $byte_sel))
|
||||
>;
|
||||
|
||||
let OtherPredicates = [HasFP8ConversionInsts] in {
|
||||
foreach Index = [0, -1] in {
|
||||
let True16Predicate = NotHasTrue16BitInsts in {
|
||||
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
|
||||
let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
|
||||
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
|
||||
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
|
||||
}
|
||||
let True16Predicate = UseFakeTrue16Insts in {
|
||||
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_fake16_e64>;
|
||||
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_fake16_e64>;
|
||||
let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in {
|
||||
def : Cvt_PK_F8_F32_E5M3_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_gfx1250_fake16_e64, DSTCLAMP.NONE>;
|
||||
def : Cvt_PK_F8_F32_E5M3_Pat<int_amdgcn_cvt_pk_fp8_f32_e5m3, Index, V_CVT_PK_FP8_F32_gfx1250_fake16_e64, DSTCLAMP.ENABLE>;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let True16Predicate = UseRealTrue16Insts in {
|
||||
defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_fp8_f32, V_CVT_PK_FP8_F32_t16_e64>;
|
||||
defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_bf8_f32, V_CVT_PK_BF8_F32_t16_e64>;
|
||||
let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in {
|
||||
defm : Cvt_PK_F8_F32_E5M3_t16_Pat<int_amdgcn_cvt_pk_fp8_f32, V_CVT_PK_FP8_F32_gfx1250_t16_e64, DSTCLAMP.NONE>;
|
||||
defm : Cvt_PK_F8_F32_E5M3_t16_Pat<int_amdgcn_cvt_pk_fp8_f32_e5m3, V_CVT_PK_FP8_F32_gfx1250_t16_e64, DSTCLAMP.ENABLE>;
|
||||
}
|
||||
}
|
||||
|
||||
let SubtargetPredicate = isGFX940Plus in {
|
||||
@ -859,7 +906,12 @@ let SubtargetPredicate = isGFX940Plus in {
|
||||
}
|
||||
|
||||
let SubtargetPredicate = isGFX12Plus in {
|
||||
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>;
|
||||
let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
|
||||
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>;
|
||||
let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in {
|
||||
def : Cvt_SR_F8_ByteSel_E5M3_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx1250_e64, f32, DSTCLAMP.NONE>;
|
||||
def : Cvt_SR_F8_ByteSel_E5M3_Pat<int_amdgcn_cvt_sr_fp8_f32_e5m3, V_CVT_SR_FP8_F32_gfx1250_e64, f32, DSTCLAMP.ENABLE>;
|
||||
}
|
||||
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f32, V_CVT_SR_BF8_F32_gfx12_e64, f32>;
|
||||
}
|
||||
}
|
||||
@ -1892,11 +1944,6 @@ defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>;
|
||||
defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>;
|
||||
defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>;
|
||||
|
||||
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
|
||||
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
|
||||
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
|
||||
defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// GFX11, GFX12
|
||||
//===----------------------------------------------------------------------===//
|
||||
@ -2057,6 +2104,13 @@ defm V_AND_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x36
|
||||
defm V_OR_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x363, "v_or_b16">;
|
||||
defm V_XOR_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x364, "v_xor_b16">;
|
||||
|
||||
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_not_gfx1250<0x369, "v_cvt_pk_fp8_f32">;
|
||||
defm V_CVT_PK_FP8_F32_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x369, "v_cvt_pk_fp8_f32">;
|
||||
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
|
||||
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx11_gfx12_not_gfx1250<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32">;
|
||||
defm V_CVT_SR_FP8_F32_gfx1250 : VOP3Only_Realtriple_with_name_gfx1250<0x36b, "V_CVT_SR_FP8_F32_gfx1250", "v_cvt_sr_fp8_f32">;
|
||||
defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx11_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
|
||||
|
||||
let AssemblerPredicate = isGFX11Plus in {
|
||||
def : AMDGPUMnemonicAlias<"v_add3_nc_u32", "v_add3_u32">;
|
||||
def : AMDGPUMnemonicAlias<"v_xor_add_u32", "v_xad_u32">;
|
||||
|
@ -2010,6 +2010,23 @@ multiclass VOP3_BITOP3_Real_Base<GFXGen Gen, bits<10> op, string asmName> {
|
||||
}
|
||||
}
|
||||
|
||||
multiclass VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<bits<10> op, string asmName, string opName = NAME,
|
||||
string pseudo_mnemonic = "", bit isSingle = 0> :
|
||||
VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
|
||||
VOP3_Realtriple_with_name<GFX12Not12_50Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
|
||||
|
||||
multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_not_gfx1250<bits<10> op, string asmName,
|
||||
string opName = NAME, string pseudo_mnemonic = ""> {
|
||||
defm _t16 : VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<op, asmName, opName#"_t16", pseudo_mnemonic, 1>;
|
||||
defm _fake16 : VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<op, asmName, opName#"_fake16", pseudo_mnemonic, 1>;
|
||||
}
|
||||
|
||||
multiclass VOP3_Realtriple_with_name_gfx11_gfx12_not_gfx1250<bits<10> op, string opName,
|
||||
string asmName, string pseudo_mnemonic = "",
|
||||
bit isSingle = 0> :
|
||||
VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
|
||||
VOP3_Realtriple_with_name<GFX12Not12_50Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// VOP3 GFX11
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -1,10 +1,188 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
|
||||
|
||||
declare i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float, float, i32, i1)
|
||||
declare i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float, i32, i32, i32)
|
||||
declare float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32, i32)
|
||||
|
||||
define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) {
|
||||
; GFX1250-TRUE16-LABEL: test_cvt_pk_fp8_f32_word0:
|
||||
; GFX1250-TRUE16: ; %bb.0:
|
||||
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.l, v0, v1 clamp
|
||||
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
|
||||
;
|
||||
; GFX1250-FAKE16-LABEL: test_cvt_pk_fp8_f32_word0:
|
||||
; GFX1250-FAKE16: ; %bb.0:
|
||||
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 clamp
|
||||
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
|
||||
;
|
||||
; GFX1250-GISEL-LABEL: test_cvt_pk_fp8_f32_word0:
|
||||
; GFX1250-GISEL: ; %bb.0:
|
||||
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 clamp
|
||||
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float %x, float %y, i32 %old, i1 false)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
|
||||
; GFX1250-TRUE16-LABEL: test_cvt_pk_fp8_f32_word1:
|
||||
; GFX1250-TRUE16: ; %bb.0:
|
||||
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.h, v0, v1 op_sel:[0,0,1] clamp
|
||||
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
|
||||
;
|
||||
; GFX1250-FAKE16-LABEL: test_cvt_pk_fp8_f32_word1:
|
||||
; GFX1250-FAKE16: ; %bb.0:
|
||||
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] clamp
|
||||
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
|
||||
;
|
||||
; GFX1250-GISEL-LABEL: test_cvt_pk_fp8_f32_word1:
|
||||
; GFX1250-GISEL: ; %bb.0:
|
||||
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] clamp
|
||||
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float %x, float %y, i32 %old, i1 true)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define amdgpu_cs void @test_cvt_pk_fp8_f32_word1_dpp(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) {
|
||||
; GFX1250-TRUE16-LABEL: test_cvt_pk_fp8_f32_word1_dpp:
|
||||
; GFX1250-TRUE16: ; %bb.0:
|
||||
; GFX1250-TRUE16-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
||||
; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
|
||||
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX1250-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.h, v0, v1 op_sel:[0,0,1] clamp
|
||||
; GFX1250-TRUE16-NEXT: global_store_b32 v[4:5], v2, off
|
||||
; GFX1250-TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-FAKE16-LABEL: test_cvt_pk_fp8_f32_word1_dpp:
|
||||
; GFX1250-FAKE16: ; %bb.0:
|
||||
; GFX1250-FAKE16-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
||||
; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
|
||||
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX1250-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] clamp
|
||||
; GFX1250-FAKE16-NEXT: global_store_b32 v[4:5], v2, off
|
||||
; GFX1250-FAKE16-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-GISEL-LABEL: test_cvt_pk_fp8_f32_word1_dpp:
|
||||
; GFX1250-GISEL: ; %bb.0:
|
||||
; GFX1250-GISEL-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
||||
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
|
||||
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX1250-GISEL-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] clamp
|
||||
; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v2, off
|
||||
; GFX1250-GISEL-NEXT: s_endpgm
|
||||
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
|
||||
%tmp1 = bitcast i32 %tmp0 to float
|
||||
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float %tmp1, float %y, i32 %old, i1 true)
|
||||
store i32 %ret, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) {
|
||||
; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte0:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 clamp
|
||||
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float %x, i32 %r, i32 %old, i32 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
|
||||
; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte1:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:1 clamp
|
||||
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float %x, i32 %r, i32 %old, i32 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) {
|
||||
; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte2:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:2 clamp
|
||||
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float %x, i32 %r, i32 %old, i32 2)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) {
|
||||
; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte3:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:3 clamp
|
||||
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float %x, i32 %r, i32 %old, i32 3)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1_dpp(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
|
||||
; GFX1250-TRUE16-LABEL: test_cvt_sr_fp8_f32_byte1_dpp:
|
||||
; GFX1250-TRUE16: ; %bb.0:
|
||||
; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
|
||||
; GFX1250-TRUE16-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 clamp quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
||||
; GFX1250-TRUE16-NEXT: global_store_b32 v[4:5], v2, off
|
||||
; GFX1250-TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-FAKE16-LABEL: test_cvt_sr_fp8_f32_byte1_dpp:
|
||||
; GFX1250-FAKE16: ; %bb.0:
|
||||
; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
|
||||
; GFX1250-FAKE16-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 clamp quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
||||
; GFX1250-FAKE16-NEXT: global_store_b32 v[4:5], v2, off
|
||||
; GFX1250-FAKE16-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-GISEL-LABEL: test_cvt_sr_fp8_f32_byte1_dpp:
|
||||
; GFX1250-GISEL: ; %bb.0:
|
||||
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
|
||||
; GFX1250-GISEL-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 clamp quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
||||
; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v2, off
|
||||
; GFX1250-GISEL-NEXT: s_endpgm
|
||||
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
|
||||
%tmp1 = bitcast i32 %tmp0 to float
|
||||
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float %tmp1, i32 %r, i32 %old, i32 1)
|
||||
store i32 %ret, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define float @test_cvt_f32_fp8_e5m3_byte0(i32 %a) {
|
||||
; GFX1250-LABEL: test_cvt_f32_fp8_e5m3_byte0:
|
||||
; GFX1250: ; %bb.0:
|
||||
|
@ -637,3 +637,51 @@ v_cvt_sr_fp8_f16 v1, v2, v3 op_sel:[1] byte_sel:2
|
||||
|
||||
v_cvt_sr_fp8_f16 v1, v2, v3 op_sel:[1] byte_sel:3
|
||||
// GFX1250: v_cvt_sr_fp8_f16 v1, v2, v3 op_sel:[1,0,0] byte_sel:3 ; encoding: [0x01,0x68,0x74,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_pk_fp8_f32 v1, v2, v3
|
||||
// GFX1250: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1]
|
||||
// GFX1250: v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_pk_fp8_f32 v1, -v2, |v3|
|
||||
// GFX1250: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
|
||||
|
||||
v_cvt_pk_fp8_f32 v1, s2, 3
|
||||
// GFX1250: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
|
||||
|
||||
v_cvt_pk_fp8_f32 v1, v2, v3 clamp
|
||||
// GFX1250: v_cvt_pk_fp8_f32 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] clamp
|
||||
// GFX1250: v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] clamp ; encoding: [0x01,0xc0,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_pk_bf8_f32 v1, v2, v3
|
||||
// GFX1250: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_pk_bf8_f32 v1, -v2, |v3|
|
||||
// GFX1250: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
|
||||
|
||||
v_cvt_pk_bf8_f32 v1, s2, 3
|
||||
// GFX1250: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
|
||||
|
||||
v_cvt_sr_fp8_f32 v1, v2, v3
|
||||
// GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_sr_fp8_f32 v10, s2, v5
|
||||
// GFX1250: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00]
|
||||
|
||||
v_cvt_sr_fp8_f32 v5, -|v255|, v4
|
||||
// GFX1250: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20]
|
||||
|
||||
v_cvt_sr_fp8_f32 v1, v2, v3 clamp
|
||||
// GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x6b,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_sr_bf8_f32 v1, v2, v3
|
||||
// GFX1250: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_sr_bf8_f32 v10, s2, v5
|
||||
// GFX1250: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00]
|
||||
|
||||
v_cvt_sr_bf8_f32 v5, -|v255|, v4
|
||||
// GFX1250: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]
|
||||
|
@ -637,3 +637,51 @@ v_cvt_sr_fp8_f16 v1, v2.h, v3 byte_sel:2
|
||||
|
||||
v_cvt_sr_fp8_f16 v1, v2.h, v3 byte_sel:3
|
||||
// GFX1250: v_cvt_sr_fp8_f16 v1, v2.h, v3 op_sel:[1,0,0] byte_sel:3 ; encoding: [0x01,0x68,0x74,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_pk_fp8_f32 v1.l, v2, v3
|
||||
// GFX1250: v_cvt_pk_fp8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_pk_fp8_f32 v1.h, v2, v3
|
||||
// GFX1250: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_pk_fp8_f32 v1, -v2, |v3|
|
||||
// GFX1250: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
|
||||
|
||||
v_cvt_pk_fp8_f32 v1, s2, 3
|
||||
// GFX1250: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
|
||||
|
||||
v_cvt_pk_fp8_f32 v1.l, v2, v3 clamp
|
||||
// GFX1250: v_cvt_pk_fp8_f32 v1.l, v2, v3 clamp ; encoding: [0x01,0x80,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_pk_fp8_f32 v1.h, v2, v3 clamp
|
||||
// GFX1250: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] clamp ; encoding: [0x01,0xc0,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_pk_bf8_f32 v1, v2, v3
|
||||
// GFX1250: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_pk_bf8_f32 v1, -v2, |v3|
|
||||
// GFX1250: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
|
||||
|
||||
v_cvt_pk_bf8_f32 v1, s2, 3
|
||||
// GFX1250: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
|
||||
|
||||
v_cvt_sr_fp8_f32 v1, v2, v3
|
||||
// GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_sr_fp8_f32 v10, s2, v5
|
||||
// GFX1250: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00]
|
||||
|
||||
v_cvt_sr_fp8_f32 v5, -|v255|, v4
|
||||
// GFX1250: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20]
|
||||
|
||||
v_cvt_sr_fp8_f32 v1, v2, v3 clamp
|
||||
// GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x6b,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_sr_bf8_f32 v1, v2, v3
|
||||
// GFX1250: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
v_cvt_sr_bf8_f32 v10, s2, v5
|
||||
// GFX1250: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00]
|
||||
|
||||
v_cvt_sr_bf8_f32 v5, -|v255|, v4
|
||||
// GFX1250: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]
|
||||
|
@ -250,6 +250,18 @@ v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:
|
||||
// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x6d,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30]
|
||||
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
|
||||
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x82,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
|
||||
// GFX12-ERR: :[[@LINE-2]]:40: error: invalid operand for instruction
|
||||
|
||||
v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| op_sel:[0,0,1] clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
|
||||
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| op_sel:[0,0,1] clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0xc2,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
|
||||
// GFX12-ERR: :[[@LINE-2]]:61: error: not a valid operand.
|
||||
|
||||
v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
|
||||
// GFX1250: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x80,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
|
||||
// GFX12-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
|
||||
|
||||
v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
|
||||
// GFX1250: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
|
||||
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
|
||||
|
@ -250,6 +250,18 @@ v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:
|
||||
// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x6d,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30]
|
||||
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
|
||||
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x82,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
|
||||
// GFX12-ERR: :[[@LINE-2]]:42: error: invalid operand for instruction
|
||||
|
||||
v_cvt_pk_fp8_f32_e64_dpp v1.h, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
|
||||
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v1.h, -v2, |v3| op_sel:[0,0,1] clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0xc2,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
|
||||
// GFX12-ERR: :[[@LINE-2]]:42: error: invalid operand for instruction
|
||||
|
||||
v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
|
||||
// GFX1250: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x80,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
|
||||
// GFX12-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
|
||||
|
||||
v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
|
||||
// GFX1250: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
|
||||
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
|
||||
|
@ -170,6 +170,18 @@ v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
|
||||
// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x6d,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1]
|
||||
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
|
||||
// GFX12-ERR: :[[@LINE-2]]:37: error: invalid operand for instruction
|
||||
|
||||
v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 op_sel:[0,0,1] clamp dpp8:[7,6,5,4,2,3,0,1]
|
||||
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 op_sel:[0,0,1] clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0xc0,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
|
||||
// GFX12-ERR: :[[@LINE-2]]:58: error: not a valid operand.
|
||||
|
||||
v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1]
|
||||
// GFX1250: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
|
||||
// GFX12-ERR: :[[@LINE-2]]:37: error: invalid operand for instruction
|
||||
|
||||
v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
|
||||
// GFX1250: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
|
||||
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
|
||||
|
@ -170,6 +170,18 @@ v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
|
||||
// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x6d,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1]
|
||||
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
|
||||
// GFX12-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction
|
||||
|
||||
v_cvt_pk_fp8_f32_e64_dpp v5.h, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1]
|
||||
// GFX1250: v_cvt_pk_fp8_f32_e64_dpp v5.h, v1, v2 op_sel:[0,0,1] clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0xc0,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
|
||||
// GFX12-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction
|
||||
|
||||
v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1]
|
||||
// GFX1250: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
|
||||
// GFX12-ERR: :[[@LINE-2]]:37: error: invalid operand for instruction
|
||||
|
||||
v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
|
||||
// GFX1250: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
|
||||
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
|
||||
|
@ -679,3 +679,60 @@
|
||||
0x01,0x09,0x74,0xd7,0x02,0x07,0x02,0x00
|
||||
# GFX1250-REAL16: v_cvt_sr_fp8_f16 v1, |v2.h|, v3 op_sel:[1,0,0] ; encoding: [0x01,0x09,0x74,0xd7,0x02,0x07,0x02,0x00]
|
||||
# GFX1250-FAKE16: v_cvt_sr_fp8_f16 v1, |v2|, v3 op_sel:[1,0,0] ; encoding: [0x01,0x09,0x74,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
0x01,0x80,0x6b,0xd7,0x02,0x07,0x02,0x00
|
||||
# GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x6b,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00
|
||||
# GFX1250-REAL16: v_cvt_pk_fp8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
# GFX1250-FAKE16: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00
|
||||
# GFX1250-REAL16: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
# GFX1250-FAKE16: v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20
|
||||
# GFX1250-REAL16: v_cvt_pk_fp8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
|
||||
# GFX1250-FAKE16: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
|
||||
|
||||
0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00
|
||||
# GFX1250-REAL16: v_cvt_pk_fp8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
|
||||
# GFX1250-FAKE16: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
|
||||
|
||||
0x01,0x80,0x69,0xd7,0x02,0x07,0x02,0x00
|
||||
# GFX1250-REAL16: v_cvt_pk_fp8_f32 v1.l, v2, v3 clamp ; encoding: [0x01,0x80,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
# GFX1250-FAKE16: v_cvt_pk_fp8_f32 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
0x01,0xc0,0x69,0xd7,0x02,0x07,0x02,0x00
|
||||
# GFX1250-REAL16: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] clamp ; encoding: [0x01,0xc0,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
# GFX1250-FAKE16: v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] clamp ; encoding: [0x01,0xc0,0x69,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00
|
||||
# GFX1250-REAL16: v_cvt_pk_bf8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
|
||||
# GFX1250-FAKE16: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20
|
||||
# GFX1250-REAL16: v_cvt_pk_bf8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
|
||||
# GFX1250-FAKE16: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
|
||||
|
||||
0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00
|
||||
# GFX1250-REAL16: v_cvt_pk_bf8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
|
||||
# GFX1250-FAKE16: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
|
||||
|
||||
0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00
|
||||
# GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00
|
||||
# GFX1250: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00]
|
||||
|
||||
0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20
|
||||
# GFX1250: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20]
|
||||
|
||||
0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00
|
||||
# GFX1250: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
|
||||
|
||||
0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00
|
||||
# GFX1250: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00]
|
||||
|
||||
0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20
|
||||
# GFX1250: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]
|
||||
|
@ -210,6 +210,17 @@
|
||||
0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
|
||||
# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
|
||||
|
||||
0x01,0x82,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed
|
||||
# GFX1250-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x82,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
|
||||
# GFX1250-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x82,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
|
||||
|
||||
0x01,0xc2,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed
|
||||
# GFX1250-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.h, -v2, |v3| op_sel:[0,0,1] clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0xc2,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
|
||||
# GFX1250-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| op_sel:[0,0,1] clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0xc2,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
|
||||
|
||||
0x01,0x80,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed
|
||||
# GFX1250: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 clamp quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x80,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
|
||||
|
||||
0xff,0x83,0x6e,0xd7,0xfa,0xfe,0xf7,0x7b,0xff,0x6f,0x05,0x30
|
||||
# GFX1250: v_cvt_sr_pk_bf16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x83,0x6e,0xd7,0xfa,0xfe,0xf7,0x7b,0xff,0x6f,0x05,0x30]
|
||||
|
||||
|
@ -146,6 +146,17 @@
|
||||
0x05,0x00,0x6d,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05
|
||||
# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6d,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
|
||||
|
||||
0x05,0x80,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21
|
||||
# GFX1250-REAL16: v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
|
||||
# GFX1250-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
|
||||
|
||||
0x05,0xc0,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21
|
||||
# GFX1250-REAL16: v_cvt_pk_fp8_f32_e64_dpp v5.h, v1, v2 op_sel:[0,0,1] clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0xc0,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
|
||||
# GFX1250-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 op_sel:[0,0,1] clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0xc0,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
|
||||
|
||||
0x05,0x80,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21
|
||||
# GFX1250: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 clamp dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x80,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
|
||||
|
||||
0xff,0x83,0x6e,0xd7,0xe9,0xfe,0xf7,0x7b,0xff,0x00,0x00,0x00
|
||||
# GFX1250: v_cvt_sr_pk_bf16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x83,0x6e,0xd7,0xe9,0xfe,0xf7,0x7b,0xff,0x00,0x00,0x00]
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user