AMDGPU: Support v_wmma_f32_16x16x128_f8f6f4 on gfx1250 (#149684)

Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
This commit is contained in:
Changpeng Fang 2025-07-21 10:09:42 -07:00 committed by GitHub
parent 0fa515f733
commit d6094370cb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
31 changed files with 1696 additions and 34 deletions

View File

@ -705,6 +705,7 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8, "V8hV16iV16iIsV8hIbI
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4, "V8fIiV16iIiV16iIsV8f", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")

View File

@ -855,6 +855,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
@ -1118,6 +1119,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ArgsForMatchingMatrixTypes = {4, 1};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
ArgsForMatchingMatrixTypes = {5, 1, 3};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
ArgsForMatchingMatrixTypes = {3, 0, 1};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;

View File

@ -157,6 +157,18 @@ void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c)
*out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true);
}
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_f8f6f4(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
// CHECK-GFX1250-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A:%.*]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C:%.*]])
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP1]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c);
}
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_f16(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> [[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)

View File

@ -114,6 +114,13 @@ void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c, int
*out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
}
void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c, int mod)
{
*out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(mod, a, 2, b, 0, c); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(1, a, mod, b, 0, c); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(1, a, 2, b, mod, c); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4' must be a constant integer}}
}
void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c, int mod)
{
*out = __builtin_amdgcn_wmma_f32_16x16x32_f16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}}

View File

@ -3717,6 +3717,20 @@ class AMDGPUWmmaIntrinsicModsAllDiff<LLVMType DstTy, LLVMType AB, LLVMType C> :
IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
class AMDGPUWmmaIntrinsicModsC_MatrixFMT :
Intrinsic<
[llvm_anyfloat_ty], // %D
[
llvm_i32_ty, // matrix_a_fmt
llvm_anyint_ty, // %A
llvm_i32_ty, // matrix_b_fmt
llvm_anyint_ty, // %B
llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs)
LLVMMatchType<0>, // %C
],
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
@ -3741,6 +3755,7 @@ def int_amdgcn_wmma_f32_16x16x128_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint
def int_amdgcn_wmma_f32_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUWmmaIntrinsicModsAB<llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_f32_16x16x128_f8f6f4 : AMDGPUWmmaIntrinsicModsC_MatrixFMT;
def int_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUWmmaIntrinsicF4ModsC<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty>;
}

View File

@ -6668,6 +6668,54 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"invalid vector type for format", &Call, Src1, Call.getArgOperand(5));
break;
}
case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
Value *Src0 = Call.getArgOperand(1);
Value *Src1 = Call.getArgOperand(3);
unsigned FmtA = cast<ConstantInt>(Call.getArgOperand(0))->getZExtValue();
unsigned FmtB = cast<ConstantInt>(Call.getArgOperand(2))->getZExtValue();
Check(FmtA <= 4, "invalid value for matrix format", Call,
Call.getArgOperand(0));
Check(FmtB <= 4, "invalid value for matrix format", Call,
Call.getArgOperand(2));
// AMDGPU::MatrixFMT values
auto getFormatNumRegs = [](unsigned FormatVal) {
switch (FormatVal) {
case 0:
case 1:
return 16u;
case 2:
case 3:
return 12u;
case 4:
return 8u;
default:
llvm_unreachable("invalid format value");
}
};
auto isValidSrcASrcBVector = [](FixedVectorType *Ty) {
if (!Ty || !Ty->getElementType()->isIntegerTy(32))
return false;
unsigned NumElts = Ty->getNumElements();
return NumElts == 16 || NumElts == 12 || NumElts == 8;
};
auto *Src0Ty = dyn_cast<FixedVectorType>(Src0->getType());
auto *Src1Ty = dyn_cast<FixedVectorType>(Src1->getType());
Check(isValidSrcASrcBVector(Src0Ty),
"operand 1 must be 8, 12 or 16 element i32 vector", &Call, Src0);
Check(isValidSrcASrcBVector(Src1Ty),
"operand 3 must be 8, 12 or 16 element i32 vector", &Call, Src1);
// Permit excess registers for the format.
Check(Src0Ty->getNumElements() >= getFormatNumRegs(FmtA),
"invalid vector type for format", &Call, Src0, Call.getArgOperand(0));
Check(Src1Ty->getNumElements() >= getFormatNumRegs(FmtB),
"invalid vector type for format", &Call, Src1, Call.getArgOperand(2));
break;
}
case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: {
Value *V = Call.getArgOperand(0);

View File

@ -1694,6 +1694,47 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
NewII->takeName(&II);
return IC.replaceInstUsesWith(II, NewII);
}
case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
Value *Src0 = II.getArgOperand(1);
Value *Src1 = II.getArgOperand(3);
unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
bool MadeChange = false;
unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
// Depending on the used format, fewer registers are required so shrink the
// vector type.
if (Src0Ty->getNumElements() > Src0NumElts) {
Src0 = IC.Builder.CreateExtractVector(
FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
IC.Builder.getInt64(0));
MadeChange = true;
}
if (Src1Ty->getNumElements() > Src1NumElts) {
Src1 = IC.Builder.CreateExtractVector(
FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
IC.Builder.getInt64(0));
MadeChange = true;
}
if (!MadeChange)
return std::nullopt;
SmallVector<Value *, 13> Args(II.args());
Args[1] = Src0;
Args[3] = Src1;
CallInst *NewII = IC.Builder.CreateIntrinsic(
IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
Args, &II);
NewII->takeName(&II);
return IC.replaceInstUsesWith(II, NewII);
}
}
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {

View File

@ -4714,6 +4714,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8:
case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8:
case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:
case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:

View File

@ -176,6 +176,8 @@ public:
ImmTyWaitVAVDst,
ImmTyWaitVMVSrc,
ImmTyBitOp3,
ImmTyMatrixAFMT,
ImmTyMatrixBFMT,
ImmTyMatrixAReuse,
ImmTyMatrixBReuse,
ImmTyByteSel,
@ -423,6 +425,8 @@ public:
bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); }
bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); }
bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); }
bool isMatrixAFMT() const { return isImmTy(ImmTyMatrixAFMT); }
bool isMatrixBFMT() const { return isImmTy(ImmTyMatrixBFMT); }
bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); }
bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
@ -1174,6 +1178,8 @@ public:
case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break;
case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;
case ImmTyBitOp3: OS << "BitOp3"; break;
case ImmTyMatrixAFMT: OS << "ImmTyMatrixAFMT"; break;
case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break;
case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break;
case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break;
case ImmTyByteSel: OS << "ByteSel" ; break;
@ -1714,6 +1720,10 @@ public:
ParseStatus parseIndexKey8bit(OperandVector &Operands);
ParseStatus parseIndexKey16bit(OperandVector &Operands);
ParseStatus parseIndexKey32bit(OperandVector &Operands);
ParseStatus tryParseMatrixFMT(OperandVector &Operands, StringRef Name,
AMDGPUOperand::ImmTy Type);
ParseStatus parseMatrixAFMT(OperandVector &Operands);
ParseStatus parseMatrixBFMT(OperandVector &Operands);
ParseStatus parseDfmtNfmt(int64_t &Format);
ParseStatus parseUfmt(int64_t &Format);
@ -1849,6 +1859,7 @@ private:
const unsigned CPol);
bool validateTFE(const MCInst &Inst, const OperandVector &Operands);
std::optional<StringRef> validateLdsDirect(const MCInst &Inst);
bool validateWMMA(const MCInst &Inst, const OperandVector &Operands);
unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@ -5409,6 +5420,37 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst,
return true;
}
bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,
const OperandVector &Operands) {
unsigned Opc = Inst.getOpcode();
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
const MCInstrDesc &Desc = MII.get(Opc);
auto validateFmt = [&](AMDGPU::OpName FmtOp, AMDGPU::OpName SrcOp) -> bool {
int FmtIdx = AMDGPU::getNamedOperandIdx(Opc, FmtOp);
if (FmtIdx == -1)
return true;
unsigned Fmt = Inst.getOperand(FmtIdx).getImm();
int SrcIdx = AMDGPU::getNamedOperandIdx(Opc, SrcOp);
unsigned RegSize =
TRI->getRegClass(Desc.operands()[SrcIdx].RegClass).getSizeInBits();
if (RegSize == AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt) * 32)
return true;
static const char *FmtNames[] = {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8",
"MATRIX_FMT_FP6", "MATRIX_FMT_BF6",
"MATRIX_FMT_FP4"};
Error(getRegLoc(mc2PseudoReg(Inst.getOperand(SrcIdx).getReg()), Operands),
"wrong register tuple size for " + Twine(FmtNames[Fmt]));
return false;
};
return validateFmt(AMDGPU::OpName::matrix_a_fmt, AMDGPU::OpName::src0) &&
validateFmt(AMDGPU::OpName::matrix_b_fmt, AMDGPU::OpName::src1);
}
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
const SMLoc &IDLoc,
const OperandVector &Operands) {
@ -5542,6 +5584,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateTFE(Inst, Operands)) {
return false;
}
if (!validateWMMA(Inst, Operands)) {
return false;
}
return true;
}
@ -7215,6 +7260,26 @@ ParseStatus AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) {
return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey32bit);
}
ParseStatus AMDGPUAsmParser::tryParseMatrixFMT(OperandVector &Operands,
StringRef Name,
AMDGPUOperand::ImmTy Type) {
return parseStringOrIntWithPrefix(Operands, Name,
{"MATRIX_FMT_FP8", "MATRIX_FMT_BF8",
"MATRIX_FMT_FP6", "MATRIX_FMT_BF6",
"MATRIX_FMT_FP4"},
Type);
}
ParseStatus AMDGPUAsmParser::parseMatrixAFMT(OperandVector &Operands) {
return tryParseMatrixFMT(Operands, "matrix_a_fmt",
AMDGPUOperand::ImmTyMatrixAFMT);
}
ParseStatus AMDGPUAsmParser::parseMatrixBFMT(OperandVector &Operands) {
return tryParseMatrixFMT(Operands, "matrix_b_fmt",
AMDGPUOperand::ImmTyMatrixBFMT);
}
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
// values to live in a joint format operand in the MCInst encoding.
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
@ -9316,6 +9381,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
DefaultVal);
}
int MatrixAFMTIdx =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_fmt);
if (MatrixAFMTIdx != -1) {
addOptionalImmOperand(Inst, Operands, OptIdx,
AMDGPUOperand::ImmTyMatrixAFMT, 0);
}
int MatrixBFMTIdx =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_fmt);
if (MatrixBFMTIdx != -1) {
addOptionalImmOperand(Inst, Operands, OptIdx,
AMDGPUOperand::ImmTyMatrixBFMT, 0);
}
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse))
addOptionalImmOperand(Inst, Operands, OptIdx,
AMDGPUOperand::ImmTyMatrixAReuse, 0);

View File

@ -877,6 +877,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsMAI)
convertMAIInst(MI);
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsWMMA)
convertWMMAInst(MI);
int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdst_in);
if (VDstIn_Idx != -1) {
@ -974,10 +977,23 @@ static void adjustMFMA_F8F6F4OpRegClass(const MCRegisterInfo &MRI,
return MO.setReg(
MRI.getSubReg(MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5));
case 8:
if (MCRegister NewReg = MRI.getSubReg(
MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7)) {
MO.setReg(NewReg);
}
return;
case 12: {
// There is no 384-bit subreg index defined.
MCRegister BaseReg = MRI.getSubReg(MO.getReg(), AMDGPU::sub0);
MCRegister NewReg = MRI.getMatchingSuperReg(
BaseReg, AMDGPU::sub0, &MRI.getRegClass(AMDGPU::VReg_384RegClassID));
return MO.setReg(NewReg);
}
case 16:
// No-op in cases where one operand is still f8/bf8.
return;
default:
llvm_unreachable("Unexpected size for mfma f8f6f4 operand");
llvm_unreachable("Unexpected size for mfma/wmma f8f6f4 operand");
}
}
@ -1015,6 +1031,35 @@ void AMDGPUDisassembler::convertMAIInst(MCInst &MI) const {
AdjustedRegClassOpcode->NumRegsSrcB);
}
void AMDGPUDisassembler::convertWMMAInst(MCInst &MI) const {
int FmtAIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::matrix_a_fmt);
if (FmtAIdx == -1)
return;
int FmtBIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::matrix_b_fmt);
unsigned FmtA = MI.getOperand(FmtAIdx).getImm();
unsigned FmtB = MI.getOperand(FmtBIdx).getImm();
const AMDGPU::MFMA_F8F6F4_Info *AdjustedRegClassOpcode =
AMDGPU::getWMMA_F8F6F4_WithFormatArgs(FmtA, FmtB, MI.getOpcode());
if (!AdjustedRegClassOpcode ||
AdjustedRegClassOpcode->Opcode == MI.getOpcode())
return;
MI.setOpcode(AdjustedRegClassOpcode->Opcode);
int Src0Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
int Src1Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(Src0Idx),
AdjustedRegClassOpcode->NumRegsSrcA);
adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(Src1Idx),
AdjustedRegClassOpcode->NumRegsSrcB);
}
struct VOPModifiers {
unsigned OpSel = 0;
unsigned OpSelHi = 0;

View File

@ -161,6 +161,7 @@ public:
void convertFMAanyK(MCInst &MI) const;
void convertSDWAInst(MCInst &MI) const;
void convertMAIInst(MCInst &MI) const;
void convertWMMAInst(MCInst &MI) const;
void convertDPP8Inst(MCInst &MI) const;
void convertMIMGInst(MCInst &MI) const;
void convertVOP3DPPInst(MCInst &MI) const;

View File

@ -1345,6 +1345,48 @@ void AMDGPUInstPrinter::printIndexKey32bit(const MCInst *MI, unsigned OpNo,
O << " index_key:" << Imm;
}
void AMDGPUInstPrinter::printMatrixFMT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O, char AorB) {
auto Imm = MI->getOperand(OpNo).getImm() & 0x7;
if (Imm == 0)
return;
O << " matrix_" << AorB << "_fmt:";
switch (Imm) {
default:
O << Imm;
break;
case WMMA::MatrixFMT::MATRIX_FMT_FP8:
O << "MATRIX_FMT_FP8";
break;
case WMMA::MatrixFMT::MATRIX_FMT_BF8:
O << "MATRIX_FMT_BF8";
break;
case WMMA::MatrixFMT::MATRIX_FMT_FP6:
O << "MATRIX_FMT_FP6";
break;
case WMMA::MatrixFMT::MATRIX_FMT_BF6:
O << "MATRIX_FMT_BF6";
break;
case WMMA::MatrixFMT::MATRIX_FMT_FP4:
O << "MATRIX_FMT_FP4";
break;
}
}
void AMDGPUInstPrinter::printMatrixAFMT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
printMatrixFMT(MI, OpNo, STI, O, 'a');
}
void AMDGPUInstPrinter::printMatrixBFMT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
printMatrixFMT(MI, OpNo, STI, O, 'b');
}
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {

View File

@ -134,6 +134,12 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printIndexKey32bit(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printMatrixFMT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O, char AorB);
void printMatrixAFMT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printMatrixBFMT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpAttr(const MCInst *MI, unsigned OpNo,

View File

@ -384,6 +384,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
if (((Desc.TSFlags & SIInstrFlags::VOP3P) ||
Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi ||
Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) &&
// Matrix B format operand reuses op_sel_hi.
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_fmt) &&
// Matrix B reuse operand reuses op_sel_hi.
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) {
Encoding |= getImplicitOpSelHiEncoding(Opcode);

View File

@ -1005,6 +1005,16 @@ enum Target : unsigned {
} // namespace Exp
namespace WMMA {
enum MatrixFMT : unsigned {
MATRIX_FMT_FP8 = 0,
MATRIX_FMT_BF8 = 1,
MATRIX_FMT_FP6 = 2,
MATRIX_FMT_BF6 = 3,
MATRIX_FMT_FP4 = 4
};
} // namespace WMMA
namespace VOP3PEncoding {
enum OpSel : uint64_t {

View File

@ -1307,6 +1307,9 @@ let PrintMethod = "printBitOp3" in
def BitOp3 : NamedIntOperand<"bitop3">;
def bitop3_0 : DefaultOperand<BitOp3, 0>;
def MatrixAFMT : CustomOperand<i32, 1, "MatrixAFMT">;
def MatrixBFMT : CustomOperand<i32, 1, "MatrixBFMT">;
def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">;
def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">;
@ -1882,6 +1885,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
!eq(VT, v4bf16) : AVSrc_64,
!eq(VT.Size, 1024) : VRegSrc_1024,
!eq(VT.Size, 512) : VRegSrc_512,
!eq(VT.Size, 384) : VRegSrc_384,
!eq(VT.Size, 256) : VRegSrc_256,
!eq(VT.Size, 192) : VRegSrc_192,
!eq(VT.Size, 128) : VRegSrc_128,
@ -1894,6 +1898,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
class getVOP3VRegSrcForVT<ValueType VT> {
RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
!eq(VT.Size, 512) : VRegSrc_512,
!eq(VT.Size, 384) : VRegSrc_384,
!eq(VT.Size, 256) : VRegSrc_256,
!eq(VT.Size, 192) : VRegSrc_192,
!eq(VT.Size, 128) : VRegSrc_128,
@ -2666,6 +2671,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
HasOMod);
field bit HasNeg = HasModifiers;
field bit HasMatrixReuse = 0;
field bit HasMatrixFMT = 0;
field bit HasSrc0Mods = HasModifiers;
field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0);

View File

@ -1207,6 +1207,7 @@ def VRegSrc_96 : SrcReg9<VReg_96>;
def VRegSrc_128: SrcReg9<VReg_128>;
def VRegSrc_192: SrcReg9<VReg_192>;
def VRegSrc_256: SrcReg9<VReg_256>;
def VRegSrc_384: SrcReg9<VReg_384>;
def VRegSrc_512: SrcReg9<VReg_512>;
def VRegSrc_1024: SrcReg9<VReg_1024>;
def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>;

View File

@ -464,6 +464,20 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
} // End SchedModel = GFX12SpeedModel
// Check if any matrix inputs are interpreted as f8 in an f8f6f4
// wmma instruction.
def PredIsF8_WMMA_SCALE : SchedPredicate<[{
TII->getNamedOperand(*MI, AMDGPU::OpName::matrix_a_fmt)->getImm() <= AMDGPU::WMMA::MATRIX_FMT_BF8 ||
TII->getNamedOperand(*MI, AMDGPU::OpName::matrix_b_fmt)->getImm() <= AMDGPU::WMMA::MATRIX_FMT_BF8
}]>;
// If either matrix format is f8, the instruction takes 2x as many
// cycles. TODO: This isn't reflected in MCA.
def WriteWMMAScale_16X16X128_F8F6F4 : SchedWriteVariant<[
SchedVar<PredIsF8_WMMA_SCALE, [WriteXDL4PassWMMA]>,
SchedVar<NoSchedPred, [WriteXDL2PassWMMA]>
]>;
multiclass GFX125xCommonWriteRes {
let ReleaseAtCycles = [8] in
@ -495,6 +509,7 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>;
def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>;
def : InstRW<[WriteWMMAScale_16X16X128_F8F6F4], (instregex "^V_WMMA_.*_16X16X128_F8F6F4.*_w32")>;
def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F32_16X16X4_F32_w32")>;
def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>;
} // End GFX125xCommonWriteRes

View File

@ -598,6 +598,29 @@ const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ,
return getMFMA_F8F6F4_InstWithNumRegs(SrcANumRegs, SrcBNumRegs, F8F8Opcode);
}
uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt) {
switch (Fmt) {
case WMMA::MATRIX_FMT_FP8:
case WMMA::MATRIX_FMT_BF8:
return 16;
case WMMA::MATRIX_FMT_FP6:
case WMMA::MATRIX_FMT_BF6:
return 12;
case WMMA::MATRIX_FMT_FP4:
return 8;
}
llvm_unreachable("covered switch over wmma scale formats");
}
const MFMA_F8F6F4_Info *getWMMA_F8F6F4_WithFormatArgs(unsigned FmtA,
unsigned FmtB,
unsigned F8F8Opcode) {
uint8_t SrcANumRegs = wmmaScaleF8F6F4FormatToNumRegs(FmtA);
uint8_t SrcBNumRegs = wmmaScaleF8F6F4FormatToNumRegs(FmtB);
return getMFMA_F8F6F4_InstWithNumRegs(SrcANumRegs, SrcBNumRegs, F8F8Opcode);
}
unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST) {
if (ST.hasFeature(AMDGPU::FeatureGFX1250Insts))
return SIEncodingFamily::GFX1250;

View File

@ -627,6 +627,14 @@ const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ,
unsigned BLGP,
unsigned F8F8Opcode);
LLVM_READNONE
uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt);
LLVM_READONLY
const MFMA_F8F6F4_Info *getWMMA_F8F6F4_WithFormatArgs(unsigned FmtA,
unsigned FmtB,
unsigned F8F8Opcode);
LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,

View File

@ -1318,13 +1318,15 @@ let WaveSizePredicate = isWave64 in {
class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0,
bit _HasMatrixReuse = 0, bit _IsF4 = 0>
bit _HasMatrixFMT = 0, bit _HasMatrixReuse = 0,
bit _IsF4 = 0>
: VOP3P_Profile<VOPProfile<ArgTy>> {
bit IsIU = _IsIU;
bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B
bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32));
int IndexType = _IndexType;
let HasMatrixFMT = _HasMatrixFMT;
let HasMatrixReuse = _HasMatrixReuse;
bit HasIModOp = _Has_ImodOp;
@ -1422,7 +1424,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit),
!eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit),
!eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit));
dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt),
(ins));
dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins));
dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins));
dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi),
@ -1436,7 +1439,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
(ins VRegSrc_64:$src2),
(ins VRegSrc_32:$src2)),
IndexKey)),
MatrixReuse, Clamp, Neg);
MatrixFMT, MatrixReuse, Clamp, Neg);
// asm
@ -1444,13 +1447,14 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 8) : "$index_key_8bit",
!eq(IndexType, 16) : "$index_key_16bit",
!eq(IndexType, 32) : "$index_key_32bit");
string MatrxFMTAsm = !if(HasMatrixFMT, "$matrix_a_fmt$matrix_b_fmt", "");
string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", "");
string ClampAsm = !if(HasClamp, "$clamp", "");
string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi",
!and(NegLoAny, !not(NegHiAny)) : "$neg_lo",
!and(!not(NegLoAny), !not(NegHiAny)) : "");
let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixReuseAsm#NegAsm#ClampAsm;
let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrxFMTAsm#MatrixReuseAsm#NegAsm#ClampAsm;
// isel patterns
bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp));
@ -1462,6 +1466,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
IsAB_BF16_IMod0 : (ins Src0VT:$src0),
IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
HasMatrixFMT : (ins timm:$matrix_a_fmt, Src0VT:$src0),
NoABMods : (ins Src0VT:$src0));
dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
@ -1474,6 +1479,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
IsAB_BF16_IMod0 : (ins Src1VT:$src1),
IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
HasMatrixFMT : (ins timm:$matrix_b_fmt, Src1VT:$src1),
NoABMods : (ins Src1VT:$src1));
dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
@ -1499,7 +1505,6 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
IsIUXF32 : (ins Src2VT:$src2),
IsSWMMAC : (ins));
dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins));
dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
!eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))),
!eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))),
@ -1508,6 +1513,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit),
!eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit),
!eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit));
dag MatrixFMTOutPat = !if(HasMatrixFMT, (ins i32:$matrix_a_fmt, i32:$matrix_b_fmt), (ins));
dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2))));
dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2));
@ -1515,7 +1521,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins));
dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat);
dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixReuseOutModPat, ClampPat);
dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat);
dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat);
dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat);
@ -1523,7 +1529,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
// wmma pattern where src2 is inline imm uses _threeaddr pseudo,
// can't use _twoaddr since it would violate src2 tied to vdst constraint.
dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat);
dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixReuseOutModPat, ClampPat);
dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat);
}
def WMMAInstInfoTable : GenericTable {
@ -1632,26 +1638,45 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1,
// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored
// for matrix A, index is i16; Matrix B uses all lanes
def F64_F64X4_WMMA_w32 : VOP3PWMMA_Profile<[v8f64, v2f64, v2f64, v8f64], 0, 0, 0, 0, 1>;
def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 1>;
def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>;
def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 1>;
def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 1>;
def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 1>;
def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>;
def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1>;
def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1>;
def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 1>;
def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 1>;
def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 1>;
def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 1>;
def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 1>;
def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 1>;
def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 1>;
def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 1>;
def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 1>;
def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 1>;
def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 1>;
def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 1>;
def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 1>;
def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 1>;
def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 1>;
def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 1>;
def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 1>;
def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 1>;
def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 1>;
def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 1>;
def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 1>;
def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 1>;
def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 1>;
def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 1>;
def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 1>;
def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 1>;
def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 1>;
multiclass WMMA_F8F6F4_Profiles<bit HasMatrixReuse> {
def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
}
defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0>;
multiclass WMMAInst_SrcFormats_mc<string OpName, string Profile> {
foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
defm _#I#_w32 : WMMAInstGFX12<OpName # "_" # I # "_w32", !cast<VOP3PWMMA_Profile>(Profile # "_" # I # "_w32"), "_w32">;
}
}
let WaveSizePredicate = isWave32 in {
let SubtargetPredicate = isGFX125xOnly in {
@ -1697,6 +1722,8 @@ defm V_SWMMAC_I32_16X16X128_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x12
defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_f16", F32_F16X64_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">;
defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">;
} // End is_wmma_xdl = 1.
} // End SubtargetPredicate = isGFX125xOnly
@ -1854,6 +1881,10 @@ let SubtargetPredicate = isGFX125xOnly in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>;
foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>;
}
def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x64_bf16, BF16_BF16X64_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
@ -1912,17 +1943,22 @@ multiclass VOP3P_Real_Base<GFXGen Gen, bits<8> op, string backing_ps_name = NAME
class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
: VOP3Pe_gfx11_gfx12<op, P>{
// opsel
let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0,
let Inst{11} = !cond(WMMAP.HasMatrixFMT : matrix_a_fmt{0},
!eq(WMMAP.IndexType, 0) : 0,
!eq(WMMAP.IndexType, 8) : index_key_8bit{0},
!eq(WMMAP.IndexType, 16) : index_key_16bit{0},
!eq(WMMAP.IndexType, 32) : index_key_32bit{0});
let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0);
let Inst{13} = !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0);
let Inst{12} = !if(WMMAP.HasMatrixFMT, matrix_a_fmt{1},
!if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0));
let Inst{13} = !if (WMMAP.HasMatrixFMT, matrix_a_fmt{2},
!if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0));
// opsel_hi
let Inst{59} = 1;
let Inst{60} = 1;
let Inst{14} = !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1);
let Inst{59} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{0}, 1);
let Inst{60} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{1}, 1);
let Inst{14} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{2},
!if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1));
// neg_lo
let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
@ -1961,6 +1997,24 @@ multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
}
}
multiclass VOP3P_Real_WMMA_F8F6F4_gfx1250<bits<8> op, VOP3PWMMA_Profile WMMAP> {
defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
let AsmString = asmName # PS.AsmOperands in
defm NAME : VOP3P_Real_WMMA_gfx1250<op, WMMAP>,
MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_twoaddr_gfx1250">;
}
multiclass VOP3P_Real_WMMA_gfx1250_SrcFormats<bits<8> op, string WMMAP> {
defm _f8_f8_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>;
foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
let isAsmParserOnly = true in { // Disable ambiguous disassembly.
defm _#I#_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>;
}
}
}
defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
@ -2035,6 +2089,8 @@ defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8B
defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>;
defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>;
defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x033, "F32_16X16X128_F8F6F4">;
defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>;

View File

@ -453,6 +453,8 @@ class VOP3Pe_Base {
bits<2> index_key_8bit;
bits<1> index_key_16bit;
bits<1> index_key_32bit;
bits<3> matrix_a_fmt;
bits<3> matrix_b_fmt;
bits<1> matrix_a_reuse;
bits<1> matrix_b_reuse;
}

View File

@ -302,6 +302,14 @@ define amdgpu_kernel void @wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
define amdgpu_ps void @wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; CHRCK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 false, <16 x half> %A, i1 false, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
define amdgpu_ps void @swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
@ -836,6 +844,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>,
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1)

View File

@ -304,6 +304,556 @@ bb:
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 1, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 2, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v8i32(i32 4, <8 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
; GFX1250: ; %bb.0: ; %bb
@ -815,6 +1365,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>,
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
@ -824,6 +1375,7 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)

View File

@ -1342,6 +1342,110 @@ bb:
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34
; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s1, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
; GFX1250: ; %bb.0: ; %bb
@ -2227,6 +2331,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>,
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)

View File

@ -1126,6 +1126,72 @@ bb:
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 1, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 3, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 4, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC:
; GFX1250: ; %bb.0: ; %bb
@ -1967,6 +2033,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>,
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)

View File

@ -923,6 +923,71 @@ v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47]
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1]
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1]
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19]
// GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32

View File

@ -363,6 +363,82 @@ v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:2
v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,0,1]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[1,0,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,1,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[1,0,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,1,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] clamp
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:-1
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid matrix_b_fmt value
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:xxx
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid matrix_b_fmt value
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47]
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP8
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP8
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF8
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP6
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF6
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP4
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP8
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP8
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF8
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP6
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF6
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP4
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0]

View File

@ -364,6 +364,45 @@
0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c
# GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c]
0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04]
0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04]
0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04]
0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04]
0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14]
0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c]
0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14]
0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03]
0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04]
0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c]
0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04]
0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84]
0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b
# GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b]

View File

@ -0,0 +1,158 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s
; ------------------------------------------------------------------------------------
; Incorrect signature for format cases (IR vector too large) wmma.f32.16x16x128.f8f6f4
; ------------------------------------------------------------------------------------
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: ret void
;
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 2, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: ret void
;
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> [[TMP0]], i32 1, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: ret void
;
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 3, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: ret void
;
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 3, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: ret void
;
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: ret void
;
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(
; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: ret void
;
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(
; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: ret void
;
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(
; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> [[TMP0]], i32 2, <12 x i32> [[TMP1]], i16 0, <8 x float> [[C]])
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: ret void
;
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(
; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> [[TMP0]], i32 4, <8 x i32> [[TMP1]], i16 0, <8 x float> [[C]])
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: ret void
;
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}

View File

@ -0,0 +1,165 @@
; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
; --------------------------------------------------------------------
; Wrong mangled types
; --------------------------------------------------------------------
; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: <16 x i64> %A
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i64_fp8___v16i32_fp8(<16 x i64> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: <16 x i64> %B
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i64_fp8(<16 x i32> %A, <16 x i64> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------
; Impossible vector types
; --------------------------------------------------------------------
; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v9i32.v16i32(i32 0, <9 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: <9 x i32> %A
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v9i32_fp8___v16i32_fp8(<9 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v9i32.v16i32(i32 0, <9 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v9i32(i32 0, <16 x i32> %A, i32 0, <9 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: <9 x i32> %B
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v9i32_fp8(<16 x i32> %A, <9 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v9i32(i32 0, <16 x i32> %A, i32 0, <9 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------
; Out of bounds format
; --------------------------------------------------------------------
; CHECK: invalid value for matrix format
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 9999, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: i32 9999
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_invalid0___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 9999, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; CHECK: invalid value for matrix format
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 9999, <16 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: i32 9999
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_invalid1(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 9999, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------
; Incorrect signature for format cases (IR vector too small)
; --------------------------------------------------------------------
; CHECK: invalid vector type for format
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: <8 x i32> %A
; CHECK-NEXT: i32 0
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v8i32_fp8___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; CHECK: invalid vector type for format
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 0, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: <12 x i32> %A
; CHECK-NEXT: i32 0
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp8___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 0, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; CHECK: invalid vector type for format
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 1, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: <8 x i32> %A
; CHECK-NEXT: i32 1
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v8i32_bf8___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 1, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; CHECK: invalid vector type for format
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 1, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: <12 x i32> %A
; CHECK-NEXT: i32 1
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_bf8___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 1, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; CHECK: invalid vector type for format
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 0, <8 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: <8 x i32> %B
; CHECK-NEXT: i32 0
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_fp8(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 0, <8 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; CHECK: invalid vector type for format
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 0, <12 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: <12 x i32> %B
; CHECK-NEXT: i32 0
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp8(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 0, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; CHECK: invalid vector type for format
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 1, <8 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: <8 x i32> %B
; CHECK-NEXT: i32 1
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_bf8(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 1, <8 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; CHECK: invalid vector type for format
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 1, <12 x i32> %B, i16 0, <8 x float> %C)
; CHECK-NEXT: <12 x i32> %B
; CHECK-NEXT: i32 1
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_bf8(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 1, <12 x i32> %B, i16 0, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}