AMDGPU: Support v_wmma_f32_16x16x128_f8f6f4 on gfx1250 (#149684)
Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
This commit is contained in:
parent
0fa515f733
commit
d6094370cb
@ -705,6 +705,7 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8, "V8hV16iV16iIsV8hIbI
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4, "V8fIiV16iIiV16iIsV8f", "nc", "gfx1250-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
|
||||
|
@ -855,6 +855,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
|
||||
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
|
||||
@ -1118,6 +1119,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
||||
ArgsForMatchingMatrixTypes = {4, 1};
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
|
||||
ArgsForMatchingMatrixTypes = {5, 1, 3};
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
|
||||
ArgsForMatchingMatrixTypes = {3, 0, 1};
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
|
||||
|
@ -157,6 +157,18 @@ void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c)
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true);
|
||||
}
|
||||
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_f8f6f4(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
|
||||
// CHECK-GFX1250-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A:%.*]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C:%.*]])
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP1]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
// Codegen test for the f8f6f4 WMMA builtin: A uses matrix format 1 and B uses
// matrix format 2, with the fifth argument (the C modifier immediate) set to 0.
// The expected IR above keeps A as <16 x i32> and narrows B to its first 12
// elements before the intrinsic call.
void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_f16(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> [[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
|
||||
|
@ -114,6 +114,13 @@ void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c, int
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
|
||||
}
|
||||
|
||||
// Negative test for the f8f6f4 WMMA builtin: each call places the non-constant
// `mod` in one of the three immediate positions (argument 0, 2 and 4 in turn)
// and expects the "must be a constant integer" diagnostic.
void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c, int mod)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(mod, a, 2, b, 0, c); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(1, a, mod, b, 0, c); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(1, a, 2, b, mod, c); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4' must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c, int mod)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x32_f16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}}
|
||||
|
@ -3717,6 +3717,20 @@ class AMDGPUWmmaIntrinsicModsAllDiff<LLVMType DstTy, LLVMType AB, LLVMType C> :
|
||||
IntrWillReturn, IntrNoCallback, IntrNoFree]
|
||||
>;
|
||||
|
||||
// WMMA intrinsic signature that carries a format immediate for each of the A
// and B matrices plus a modifier immediate for C. The three immediates are
// arguments 0, 2 and 4; the result type and C share one overloaded float type.
class AMDGPUWmmaIntrinsicModsC_MatrixFMT :
  Intrinsic<
    [llvm_anyfloat_ty],   // %D
    [
      llvm_i32_ty,        // matrix_a_fmt
      llvm_anyint_ty,     // %A
      llvm_i32_ty,        // matrix_b_fmt
      llvm_anyint_ty,     // %B
      llvm_i16_ty,        // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs)
      LLVMMatchType<0>,   // %C
    ],
    [IntrNoMem, IntrConvergent,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>,
     IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
|
||||
|
||||
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
|
||||
def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
|
||||
@ -3741,6 +3755,7 @@ def int_amdgcn_wmma_f32_16x16x128_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint
|
||||
def int_amdgcn_wmma_f32_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_f32_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUWmmaIntrinsicModsAB<llvm_anyint_ty, llvm_anyint_ty>;
|
||||
def int_amdgcn_wmma_f32_16x16x128_f8f6f4 : AMDGPUWmmaIntrinsicModsC_MatrixFMT;
|
||||
def int_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUWmmaIntrinsicF4ModsC<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty>;
|
||||
}
|
||||
|
||||
|
@ -6668,6 +6668,54 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
|
||||
"invalid vector type for format", &Call, Src1, Call.getArgOperand(5));
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
  // Operands used here: 0 = matrix A format, 1 = A, 2 = matrix B format,
  // 3 = B.
  Value *FmtAOp = Call.getArgOperand(0);
  Value *SrcA = Call.getArgOperand(1);
  Value *FmtBOp = Call.getArgOperand(2);
  Value *SrcB = Call.getArgOperand(3);

  // Only format values 0 through 4 are accepted.
  unsigned FmtA = cast<ConstantInt>(FmtAOp)->getZExtValue();
  unsigned FmtB = cast<ConstantInt>(FmtBOp)->getZExtValue();
  Check(FmtA <= 4, "invalid value for matrix format", Call, FmtAOp);
  Check(FmtB <= 4, "invalid value for matrix format", Call, FmtBOp);

  // Maps an AMDGPU::MatrixFMT value to the number of i32 elements that the
  // format needs: 16 for values 0-1, 12 for values 2-3, 8 for value 4.
  auto RegsForFormat = [](unsigned Fmt) -> unsigned {
    if (Fmt <= 1)
      return 16;
    if (Fmt <= 3)
      return 12;
    if (Fmt == 4)
      return 8;
    llvm_unreachable("invalid format value");
  };

  // A and B must be fixed vectors of i32 with 8, 12 or 16 elements.
  auto IsAllowedSrcVector = [](FixedVectorType *VecTy) {
    if (!VecTy)
      return false;
    if (!VecTy->getElementType()->isIntegerTy(32))
      return false;
    switch (VecTy->getNumElements()) {
    case 8:
    case 12:
    case 16:
      return true;
    default:
      return false;
    }
  };

  auto *SrcATy = dyn_cast<FixedVectorType>(SrcA->getType());
  auto *SrcBTy = dyn_cast<FixedVectorType>(SrcB->getType());
  Check(IsAllowedSrcVector(SrcATy),
        "operand 1 must be 8, 12 or 16 element i32 vector", &Call, SrcA);
  Check(IsAllowedSrcVector(SrcBTy),
        "operand 3 must be 8, 12 or 16 element i32 vector", &Call, SrcB);

  // Permit excess registers for the format: the vector may be wider than the
  // format requires, but never narrower.
  Check(SrcATy->getNumElements() >= RegsForFormat(FmtA),
        "invalid vector type for format", &Call, SrcA, FmtAOp);
  Check(SrcBTy->getNumElements() >= RegsForFormat(FmtB),
        "invalid vector type for format", &Call, SrcB, FmtBOp);
  break;
}
|
||||
case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
|
||||
case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: {
|
||||
Value *V = Call.getArgOperand(0);
|
||||
|
@ -1694,6 +1694,47 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
|
||||
NewII->takeName(&II);
|
||||
return IC.replaceInstUsesWith(II, NewII);
|
||||
}
|
||||
case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
  // Operands used here: 0 = matrix A format, 1 = A, 2 = matrix B format,
  // 3 = B, 5 = C. Both format immediates are read with the same width because
  // both feed the same `unsigned` parameter of wmmaScaleF8F6F4FormatToNumRegs.
  const unsigned FmtA =
      cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
  const unsigned FmtB =
      cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();

  bool MadeChange = false;

  // Depending on the used format, fewer registers are required so shrink the
  // vector type. Returns Src unchanged when it is not wider than the format
  // requires; otherwise returns an extract of its leading elements and
  // records that the call must be rebuilt.
  auto ShrinkToFormat = [&](Value *Src, unsigned Fmt) -> Value * {
    auto *SrcTy = cast<FixedVectorType>(Src->getType());
    const unsigned NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt);
    if (SrcTy->getNumElements() <= NumElts)
      return Src;

    MadeChange = true;
    return IC.Builder.CreateExtractVector(
        FixedVectorType::get(SrcTy->getElementType(), NumElts), Src,
        IC.Builder.getInt64(0));
  };

  Value *Src0 = ShrinkToFormat(II.getArgOperand(1), FmtA);
  Value *Src1 = ShrinkToFormat(II.getArgOperand(3), FmtB);

  if (!MadeChange)
    return std::nullopt;

  // Rebuild the call with the narrowed sources; every other argument is
  // carried over as is. The overload types are (C type, A type, B type).
  SmallVector<Value *, 13> Args(II.args());
  Args[1] = Src0;
  Args[3] = Src1;

  CallInst *NewII = IC.Builder.CreateIntrinsic(
      IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
      Args, &II);
  NewII->takeName(&II);
  return IC.replaceInstUsesWith(II, NewII);
}
|
||||
}
|
||||
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
|
||||
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
|
||||
|
@ -4714,6 +4714,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||
case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8:
|
||||
case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8:
|
||||
case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:
|
||||
case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
|
||||
case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
|
||||
case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
|
||||
case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
|
||||
|
@ -176,6 +176,8 @@ public:
|
||||
ImmTyWaitVAVDst,
|
||||
ImmTyWaitVMVSrc,
|
||||
ImmTyBitOp3,
|
||||
ImmTyMatrixAFMT,
|
||||
ImmTyMatrixBFMT,
|
||||
ImmTyMatrixAReuse,
|
||||
ImmTyMatrixBReuse,
|
||||
ImmTyByteSel,
|
||||
@ -423,6 +425,8 @@ public:
|
||||
bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); }
|
||||
bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); }
|
||||
bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); }
|
||||
bool isMatrixAFMT() const { return isImmTy(ImmTyMatrixAFMT); }
|
||||
bool isMatrixBFMT() const { return isImmTy(ImmTyMatrixBFMT); }
|
||||
bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); }
|
||||
bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); }
|
||||
bool isTFE() const { return isImmTy(ImmTyTFE); }
|
||||
@ -1174,6 +1178,8 @@ public:
|
||||
case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break;
|
||||
case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;
|
||||
case ImmTyBitOp3: OS << "BitOp3"; break;
|
||||
case ImmTyMatrixAFMT: OS << "ImmTyMatrixAFMT"; break;
|
||||
case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break;
|
||||
case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break;
|
||||
case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break;
|
||||
case ImmTyByteSel: OS << "ByteSel" ; break;
|
||||
@ -1714,6 +1720,10 @@ public:
|
||||
ParseStatus parseIndexKey8bit(OperandVector &Operands);
|
||||
ParseStatus parseIndexKey16bit(OperandVector &Operands);
|
||||
ParseStatus parseIndexKey32bit(OperandVector &Operands);
|
||||
ParseStatus tryParseMatrixFMT(OperandVector &Operands, StringRef Name,
|
||||
AMDGPUOperand::ImmTy Type);
|
||||
ParseStatus parseMatrixAFMT(OperandVector &Operands);
|
||||
ParseStatus parseMatrixBFMT(OperandVector &Operands);
|
||||
|
||||
ParseStatus parseDfmtNfmt(int64_t &Format);
|
||||
ParseStatus parseUfmt(int64_t &Format);
|
||||
@ -1849,6 +1859,7 @@ private:
|
||||
const unsigned CPol);
|
||||
bool validateTFE(const MCInst &Inst, const OperandVector &Operands);
|
||||
std::optional<StringRef> validateLdsDirect(const MCInst &Inst);
|
||||
bool validateWMMA(const MCInst &Inst, const OperandVector &Operands);
|
||||
unsigned getConstantBusLimit(unsigned Opcode) const;
|
||||
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
|
||||
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
|
||||
@ -5409,6 +5420,37 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst,
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,
|
||||
const OperandVector &Operands) {
|
||||
unsigned Opc = Inst.getOpcode();
|
||||
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
|
||||
const MCInstrDesc &Desc = MII.get(Opc);
|
||||
|
||||
auto validateFmt = [&](AMDGPU::OpName FmtOp, AMDGPU::OpName SrcOp) -> bool {
|
||||
int FmtIdx = AMDGPU::getNamedOperandIdx(Opc, FmtOp);
|
||||
if (FmtIdx == -1)
|
||||
return true;
|
||||
unsigned Fmt = Inst.getOperand(FmtIdx).getImm();
|
||||
int SrcIdx = AMDGPU::getNamedOperandIdx(Opc, SrcOp);
|
||||
unsigned RegSize =
|
||||
TRI->getRegClass(Desc.operands()[SrcIdx].RegClass).getSizeInBits();
|
||||
|
||||
if (RegSize == AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt) * 32)
|
||||
return true;
|
||||
|
||||
static const char *FmtNames[] = {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8",
|
||||
"MATRIX_FMT_FP6", "MATRIX_FMT_BF6",
|
||||
"MATRIX_FMT_FP4"};
|
||||
|
||||
Error(getRegLoc(mc2PseudoReg(Inst.getOperand(SrcIdx).getReg()), Operands),
|
||||
"wrong register tuple size for " + Twine(FmtNames[Fmt]));
|
||||
return false;
|
||||
};
|
||||
|
||||
return validateFmt(AMDGPU::OpName::matrix_a_fmt, AMDGPU::OpName::src0) &&
|
||||
validateFmt(AMDGPU::OpName::matrix_b_fmt, AMDGPU::OpName::src1);
|
||||
}
|
||||
|
||||
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
|
||||
const SMLoc &IDLoc,
|
||||
const OperandVector &Operands) {
|
||||
@ -5542,6 +5584,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
|
||||
if (!validateTFE(Inst, Operands)) {
|
||||
return false;
|
||||
}
|
||||
if (!validateWMMA(Inst, Operands)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -7215,6 +7260,26 @@ ParseStatus AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) {
|
||||
return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey32bit);
|
||||
}
|
||||
|
||||
/// Parses a `<Name>:<value>` matrix format operand and records it with
/// immediate type \p Type.
///
/// The symbolic names are listed in the same order as the WMMA::MatrixFMT
/// encodings 0-4.
/// NOTE(review): presumably parseStringOrIntWithPrefix maps a name to its
/// position in this list; confirm against that helper.
ParseStatus AMDGPUAsmParser::tryParseMatrixFMT(OperandVector &Operands,
                                               StringRef Name,
                                               AMDGPUOperand::ImmTy Type) {
  return parseStringOrIntWithPrefix(
      Operands, Name,
      {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8", "MATRIX_FMT_FP6", "MATRIX_FMT_BF6",
       "MATRIX_FMT_FP4"},
      Type);
}
|
||||
|
||||
/// Parses the `matrix_a_fmt` operand as an ImmTyMatrixAFMT immediate.
ParseStatus AMDGPUAsmParser::parseMatrixAFMT(OperandVector &Operands) {
  const AMDGPUOperand::ImmTy Kind = AMDGPUOperand::ImmTyMatrixAFMT;
  return tryParseMatrixFMT(Operands, "matrix_a_fmt", Kind);
}
|
||||
|
||||
/// Parses the `matrix_b_fmt` operand as an ImmTyMatrixBFMT immediate.
ParseStatus AMDGPUAsmParser::parseMatrixBFMT(OperandVector &Operands) {
  const AMDGPUOperand::ImmTy Kind = AMDGPUOperand::ImmTyMatrixBFMT;
  return tryParseMatrixFMT(Operands, "matrix_b_fmt", Kind);
}
|
||||
|
||||
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
|
||||
// values to live in a joint format operand in the MCInst encoding.
|
||||
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
|
||||
@ -9316,6 +9381,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
|
||||
DefaultVal);
|
||||
}
|
||||
|
||||
int MatrixAFMTIdx =
|
||||
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_fmt);
|
||||
if (MatrixAFMTIdx != -1) {
|
||||
addOptionalImmOperand(Inst, Operands, OptIdx,
|
||||
AMDGPUOperand::ImmTyMatrixAFMT, 0);
|
||||
}
|
||||
|
||||
int MatrixBFMTIdx =
|
||||
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_fmt);
|
||||
if (MatrixBFMTIdx != -1) {
|
||||
addOptionalImmOperand(Inst, Operands, OptIdx,
|
||||
AMDGPUOperand::ImmTyMatrixBFMT, 0);
|
||||
}
|
||||
|
||||
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse))
|
||||
addOptionalImmOperand(Inst, Operands, OptIdx,
|
||||
AMDGPUOperand::ImmTyMatrixAReuse, 0);
|
||||
|
@ -877,6 +877,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
|
||||
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsMAI)
|
||||
convertMAIInst(MI);
|
||||
|
||||
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsWMMA)
|
||||
convertWMMAInst(MI);
|
||||
|
||||
int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
|
||||
AMDGPU::OpName::vdst_in);
|
||||
if (VDstIn_Idx != -1) {
|
||||
@ -974,10 +977,23 @@ static void adjustMFMA_F8F6F4OpRegClass(const MCRegisterInfo &MRI,
|
||||
return MO.setReg(
|
||||
MRI.getSubReg(MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5));
|
||||
case 8:
|
||||
if (MCRegister NewReg = MRI.getSubReg(
|
||||
MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7)) {
|
||||
MO.setReg(NewReg);
|
||||
}
|
||||
return;
|
||||
case 12: {
|
||||
// There is no 384-bit subreg index defined.
|
||||
MCRegister BaseReg = MRI.getSubReg(MO.getReg(), AMDGPU::sub0);
|
||||
MCRegister NewReg = MRI.getMatchingSuperReg(
|
||||
BaseReg, AMDGPU::sub0, &MRI.getRegClass(AMDGPU::VReg_384RegClassID));
|
||||
return MO.setReg(NewReg);
|
||||
}
|
||||
case 16:
|
||||
// No-op in cases where one operand is still f8/bf8.
|
||||
return;
|
||||
default:
|
||||
llvm_unreachable("Unexpected size for mfma f8f6f4 operand");
|
||||
llvm_unreachable("Unexpected size for mfma/wmma f8f6f4 operand");
|
||||
}
|
||||
}
|
||||
|
||||
@ -1015,6 +1031,35 @@ void AMDGPUDisassembler::convertMAIInst(MCInst &MI) const {
|
||||
AdjustedRegClassOpcode->NumRegsSrcB);
|
||||
}
|
||||
|
||||
/// Rewrites a decoded WMMA instruction that has matrix format operands to the
/// opcode variant whose src0/src1 register counts match those formats, and
/// adjusts the two source register operands to the new sizes.
///
/// Does nothing when the opcode has no matrix_a_fmt operand, when no variant
/// is found, or when the variant is the opcode already in place.
void AMDGPUDisassembler::convertWMMAInst(MCInst &MI) const {
  const unsigned DecodedOpc = MI.getOpcode();

  const int AFmtIdx =
      AMDGPU::getNamedOperandIdx(DecodedOpc, AMDGPU::OpName::matrix_a_fmt);
  if (AFmtIdx == -1)
    return;
  const int BFmtIdx =
      AMDGPU::getNamedOperandIdx(DecodedOpc, AMDGPU::OpName::matrix_b_fmt);

  const unsigned AFmt = MI.getOperand(AFmtIdx).getImm();
  const unsigned BFmt = MI.getOperand(BFmtIdx).getImm();

  // NOTE(review): the lookup below ends in llvm_unreachable for format values
  // above MATRIX_FMT_FP4 - confirm the decoder can never produce them.
  const AMDGPU::MFMA_F8F6F4_Info *Variant =
      AMDGPU::getWMMA_F8F6F4_WithFormatArgs(AFmt, BFmt, DecodedOpc);
  if (!Variant)
    return;
  if (Variant->Opcode == DecodedOpc)
    return;

  // The source operand indices are looked up on the new opcode.
  MI.setOpcode(Variant->Opcode);
  const unsigned NewOpc = MI.getOpcode();
  const int SrcAIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0);
  const int SrcBIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src1);

  adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(SrcAIdx),
                              Variant->NumRegsSrcA);
  adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(SrcBIdx),
                              Variant->NumRegsSrcB);
}
|
||||
|
||||
struct VOPModifiers {
|
||||
unsigned OpSel = 0;
|
||||
unsigned OpSelHi = 0;
|
||||
|
@ -161,6 +161,7 @@ public:
|
||||
void convertFMAanyK(MCInst &MI) const;
|
||||
void convertSDWAInst(MCInst &MI) const;
|
||||
void convertMAIInst(MCInst &MI) const;
|
||||
void convertWMMAInst(MCInst &MI) const;
|
||||
void convertDPP8Inst(MCInst &MI) const;
|
||||
void convertMIMGInst(MCInst &MI) const;
|
||||
void convertVOP3DPPInst(MCInst &MI) const;
|
||||
|
@ -1345,6 +1345,48 @@ void AMDGPUInstPrinter::printIndexKey32bit(const MCInst *MI, unsigned OpNo,
|
||||
O << " index_key:" << Imm;
|
||||
}
|
||||
|
||||
void AMDGPUInstPrinter::printMatrixFMT(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI,
|
||||
raw_ostream &O, char AorB) {
|
||||
auto Imm = MI->getOperand(OpNo).getImm() & 0x7;
|
||||
if (Imm == 0)
|
||||
return;
|
||||
|
||||
O << " matrix_" << AorB << "_fmt:";
|
||||
switch (Imm) {
|
||||
default:
|
||||
O << Imm;
|
||||
break;
|
||||
case WMMA::MatrixFMT::MATRIX_FMT_FP8:
|
||||
O << "MATRIX_FMT_FP8";
|
||||
break;
|
||||
case WMMA::MatrixFMT::MATRIX_FMT_BF8:
|
||||
O << "MATRIX_FMT_BF8";
|
||||
break;
|
||||
case WMMA::MatrixFMT::MATRIX_FMT_FP6:
|
||||
O << "MATRIX_FMT_FP6";
|
||||
break;
|
||||
case WMMA::MatrixFMT::MATRIX_FMT_BF6:
|
||||
O << "MATRIX_FMT_BF6";
|
||||
break;
|
||||
case WMMA::MatrixFMT::MATRIX_FMT_FP4:
|
||||
O << "MATRIX_FMT_FP4";
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void AMDGPUInstPrinter::printMatrixAFMT(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI,
|
||||
raw_ostream &O) {
|
||||
printMatrixFMT(MI, OpNo, STI, O, 'a');
|
||||
}
|
||||
|
||||
void AMDGPUInstPrinter::printMatrixBFMT(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI,
|
||||
raw_ostream &O) {
|
||||
printMatrixFMT(MI, OpNo, STI, O, 'b');
|
||||
}
|
||||
|
||||
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
|
||||
const MCSubtargetInfo &STI,
|
||||
raw_ostream &O) {
|
||||
|
@ -134,6 +134,12 @@ private:
|
||||
const MCSubtargetInfo &STI, raw_ostream &O);
|
||||
void printIndexKey32bit(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI, raw_ostream &O);
|
||||
void printMatrixFMT(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI, raw_ostream &O, char AorB);
|
||||
void printMatrixAFMT(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI, raw_ostream &O);
|
||||
void printMatrixBFMT(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI, raw_ostream &O);
|
||||
void printInterpSlot(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI, raw_ostream &O);
|
||||
void printInterpAttr(const MCInst *MI, unsigned OpNo,
|
||||
|
@ -384,6 +384,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
|
||||
if (((Desc.TSFlags & SIInstrFlags::VOP3P) ||
|
||||
Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi ||
|
||||
Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) &&
|
||||
// Matrix B format operand reuses op_sel_hi.
|
||||
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_fmt) &&
|
||||
// Matrix B reuse operand reuses op_sel_hi.
|
||||
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) {
|
||||
Encoding |= getImplicitOpSelHiEncoding(Opcode);
|
||||
|
@ -1005,6 +1005,16 @@ enum Target : unsigned {
|
||||
|
||||
} // namespace Exp
|
||||
|
||||
namespace WMMA {

/// Values of the matrix_a_fmt / matrix_b_fmt operands of the f8f6f4 WMMA
/// instructions.
enum MatrixFMT : unsigned {
  MATRIX_FMT_FP8 = 0, ///< 8-bit "FP8" format.
  MATRIX_FMT_BF8 = 1, ///< 8-bit "BF8" format.
  MATRIX_FMT_FP6 = 2, ///< 6-bit "FP6" format.
  MATRIX_FMT_BF6 = 3, ///< 6-bit "BF6" format.
  MATRIX_FMT_FP4 = 4, ///< 4-bit "FP4" format.
};

} // namespace WMMA
|
||||
|
||||
namespace VOP3PEncoding {
|
||||
|
||||
enum OpSel : uint64_t {
|
||||
|
@ -1307,6 +1307,9 @@ let PrintMethod = "printBitOp3" in
|
||||
def BitOp3 : NamedIntOperand<"bitop3">;
|
||||
def bitop3_0 : DefaultOperand<BitOp3, 0>;
|
||||
|
||||
def MatrixAFMT : CustomOperand<i32, 1, "MatrixAFMT">;
|
||||
def MatrixBFMT : CustomOperand<i32, 1, "MatrixBFMT">;
|
||||
|
||||
def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">;
|
||||
def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">;
|
||||
|
||||
@ -1882,6 +1885,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
|
||||
!eq(VT, v4bf16) : AVSrc_64,
|
||||
!eq(VT.Size, 1024) : VRegSrc_1024,
|
||||
!eq(VT.Size, 512) : VRegSrc_512,
|
||||
!eq(VT.Size, 384) : VRegSrc_384,
|
||||
!eq(VT.Size, 256) : VRegSrc_256,
|
||||
!eq(VT.Size, 192) : VRegSrc_192,
|
||||
!eq(VT.Size, 128) : VRegSrc_128,
|
||||
@ -1894,6 +1898,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
|
||||
class getVOP3VRegSrcForVT<ValueType VT> {
|
||||
RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
|
||||
!eq(VT.Size, 512) : VRegSrc_512,
|
||||
!eq(VT.Size, 384) : VRegSrc_384,
|
||||
!eq(VT.Size, 256) : VRegSrc_256,
|
||||
!eq(VT.Size, 192) : VRegSrc_192,
|
||||
!eq(VT.Size, 128) : VRegSrc_128,
|
||||
@ -2666,6 +2671,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
|
||||
HasOMod);
|
||||
field bit HasNeg = HasModifiers;
|
||||
field bit HasMatrixReuse = 0;
|
||||
field bit HasMatrixFMT = 0;
|
||||
|
||||
field bit HasSrc0Mods = HasModifiers;
|
||||
field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0);
|
||||
|
@ -1207,6 +1207,7 @@ def VRegSrc_96 : SrcReg9<VReg_96>;
|
||||
def VRegSrc_128: SrcReg9<VReg_128>;
|
||||
def VRegSrc_192: SrcReg9<VReg_192>;
|
||||
def VRegSrc_256: SrcReg9<VReg_256>;
|
||||
def VRegSrc_384: SrcReg9<VReg_384>;
|
||||
def VRegSrc_512: SrcReg9<VReg_512>;
|
||||
def VRegSrc_1024: SrcReg9<VReg_1024>;
|
||||
def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>;
|
||||
|
@ -464,6 +464,20 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
|
||||
|
||||
} // End SchedModel = GFX12SpeedModel
|
||||
|
||||
// True when at least one matrix input of an f8f6f4 wmma instruction is
// interpreted as f8, i.e. its format immediate is no greater than
// MATRIX_FMT_BF8.
def PredIsF8_WMMA_SCALE : SchedPredicate<[{
  TII->getNamedOperand(*MI, AMDGPU::OpName::matrix_a_fmt)->getImm() <=
      AMDGPU::WMMA::MATRIX_FMT_BF8 ||
  TII->getNamedOperand(*MI, AMDGPU::OpName::matrix_b_fmt)->getImm() <=
      AMDGPU::WMMA::MATRIX_FMT_BF8
}]>;
|
||||
|
||||
// Variant write for the 16x16x128 f8f6f4 wmma. When either matrix format is
// f8 the instruction takes 2x as many cycles, so the 4-pass XDL write is
// selected; otherwise the 2-pass write is used.
// TODO: This isn't reflected in MCA.
def WriteWMMAScale_16X16X128_F8F6F4 : SchedWriteVariant<[
  SchedVar<PredIsF8_WMMA_SCALE, [WriteXDL4PassWMMA]>,
  SchedVar<NoSchedPred,         [WriteXDL2PassWMMA]>
]>;
|
||||
|
||||
multiclass GFX125xCommonWriteRes {
|
||||
|
||||
let ReleaseAtCycles = [8] in
|
||||
@ -495,6 +509,7 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
|
||||
|
||||
def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>;
|
||||
def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>;
|
||||
def : InstRW<[WriteWMMAScale_16X16X128_F8F6F4], (instregex "^V_WMMA_.*_16X16X128_F8F6F4.*_w32")>;
|
||||
def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F32_16X16X4_F32_w32")>;
|
||||
def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>;
|
||||
} // End GFX125xCommonWriteRes
|
||||
|
@ -598,6 +598,29 @@ const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ,
|
||||
return getMFMA_F8F6F4_InstWithNumRegs(SrcANumRegs, SrcBNumRegs, F8F8Opcode);
|
||||
}
|
||||
|
||||
uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt) {
|
||||
switch (Fmt) {
|
||||
case WMMA::MATRIX_FMT_FP8:
|
||||
case WMMA::MATRIX_FMT_BF8:
|
||||
return 16;
|
||||
case WMMA::MATRIX_FMT_FP6:
|
||||
case WMMA::MATRIX_FMT_BF6:
|
||||
return 12;
|
||||
case WMMA::MATRIX_FMT_FP4:
|
||||
return 8;
|
||||
}
|
||||
|
||||
llvm_unreachable("covered switch over wmma scale formats");
|
||||
}
|
||||
|
||||
/// Looks up the instruction-info entry related to \p F8F8Opcode whose source
/// register counts match the matrix formats \p FmtA and \p FmtB.
///
/// Each format is first converted to a register count, then the lookup is
/// delegated to getMFMA_F8F6F4_InstWithNumRegs.
const MFMA_F8F6F4_Info *getWMMA_F8F6F4_WithFormatArgs(unsigned FmtA,
                                                      unsigned FmtB,
                                                      unsigned F8F8Opcode) {
  return getMFMA_F8F6F4_InstWithNumRegs(wmmaScaleF8F6F4FormatToNumRegs(FmtA),
                                        wmmaScaleF8F6F4FormatToNumRegs(FmtB),
                                        F8F8Opcode);
}
|
||||
|
||||
unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST) {
|
||||
if (ST.hasFeature(AMDGPU::FeatureGFX1250Insts))
|
||||
return SIEncodingFamily::GFX1250;
|
||||
|
@ -627,6 +627,14 @@ const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ,
|
||||
unsigned BLGP,
|
||||
unsigned F8F8Opcode);
|
||||
|
||||
LLVM_READNONE
|
||||
uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt);
|
||||
|
||||
LLVM_READONLY
|
||||
const MFMA_F8F6F4_Info *getWMMA_F8F6F4_WithFormatArgs(unsigned FmtA,
|
||||
unsigned FmtB,
|
||||
unsigned F8F8Opcode);
|
||||
|
||||
LLVM_READONLY
|
||||
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
|
||||
uint8_t NumComponents,
|
||||
|
@ -1318,13 +1318,15 @@ let WaveSizePredicate = isWave64 in {
|
||||
|
||||
class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0,
|
||||
bit _HasMatrixReuse = 0, bit _IsF4 = 0>
|
||||
bit _HasMatrixFMT = 0, bit _HasMatrixReuse = 0,
|
||||
bit _IsF4 = 0>
|
||||
: VOP3P_Profile<VOPProfile<ArgTy>> {
|
||||
bit IsIU = _IsIU;
|
||||
bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B
|
||||
bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32));
|
||||
|
||||
int IndexType = _IndexType;
|
||||
let HasMatrixFMT = _HasMatrixFMT;
|
||||
let HasMatrixReuse = _HasMatrixReuse;
|
||||
|
||||
bit HasIModOp = _Has_ImodOp;
|
||||
@ -1422,7 +1424,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
!eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit),
|
||||
!eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit),
|
||||
!eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit));
|
||||
|
||||
dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt),
|
||||
(ins));
|
||||
dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins));
|
||||
dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins));
|
||||
dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi),
|
||||
@ -1436,7 +1439,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
(ins VRegSrc_64:$src2),
|
||||
(ins VRegSrc_32:$src2)),
|
||||
IndexKey)),
|
||||
MatrixReuse, Clamp, Neg);
|
||||
MatrixFMT, MatrixReuse, Clamp, Neg);
|
||||
|
||||
// asm
|
||||
|
||||
@ -1444,13 +1447,14 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
!eq(IndexType, 8) : "$index_key_8bit",
|
||||
!eq(IndexType, 16) : "$index_key_16bit",
|
||||
!eq(IndexType, 32) : "$index_key_32bit");
|
||||
string MatrxFMTAsm = !if(HasMatrixFMT, "$matrix_a_fmt$matrix_b_fmt", "");
|
||||
string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", "");
|
||||
string ClampAsm = !if(HasClamp, "$clamp", "");
|
||||
string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi",
|
||||
!and(NegLoAny, !not(NegHiAny)) : "$neg_lo",
|
||||
!and(!not(NegLoAny), !not(NegHiAny)) : "");
|
||||
|
||||
let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixReuseAsm#NegAsm#ClampAsm;
|
||||
let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrxFMTAsm#MatrixReuseAsm#NegAsm#ClampAsm;
|
||||
|
||||
// isel patterns
|
||||
bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp));
|
||||
@ -1462,6 +1466,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
|
||||
IsAB_BF16_IMod0 : (ins Src0VT:$src0),
|
||||
IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
|
||||
HasMatrixFMT : (ins timm:$matrix_a_fmt, Src0VT:$src0),
|
||||
NoABMods : (ins Src0VT:$src0));
|
||||
dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
|
||||
IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
|
||||
@ -1474,6 +1479,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
|
||||
IsAB_BF16_IMod0 : (ins Src1VT:$src1),
|
||||
IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
|
||||
HasMatrixFMT : (ins timm:$matrix_b_fmt, Src1VT:$src1),
|
||||
NoABMods : (ins Src1VT:$src1));
|
||||
dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
|
||||
IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
|
||||
@ -1499,7 +1505,6 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
IsIUXF32 : (ins Src2VT:$src2),
|
||||
IsSWMMAC : (ins));
|
||||
dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins));
|
||||
|
||||
dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
|
||||
!eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))),
|
||||
!eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))),
|
||||
@ -1508,6 +1513,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
!eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit),
|
||||
!eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit),
|
||||
!eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit));
|
||||
dag MatrixFMTOutPat = !if(HasMatrixFMT, (ins i32:$matrix_a_fmt, i32:$matrix_b_fmt), (ins));
|
||||
dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2))));
|
||||
dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2));
|
||||
|
||||
@ -1515,7 +1521,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins));
|
||||
|
||||
dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat);
|
||||
dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixReuseOutModPat, ClampPat);
|
||||
dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat);
|
||||
|
||||
dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat);
|
||||
dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat);
|
||||
@ -1523,7 +1529,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
// wmma pattern where src2 is inline imm uses _threeaddr pseudo,
|
||||
// can't use _twoaddr since it would violate src2 tied to vdst constraint.
|
||||
dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat);
|
||||
dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixReuseOutModPat, ClampPat);
|
||||
dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat);
|
||||
}
|
||||
|
||||
def WMMAInstInfoTable : GenericTable {
|
||||
@ -1632,26 +1638,45 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1,
|
||||
// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored
|
||||
// for matrix A, index is i16; Matrix B uses all lanes
|
||||
|
||||
def F64_F64X4_WMMA_w32 : VOP3PWMMA_Profile<[v8f64, v2f64, v2f64, v8f64], 0, 0, 0, 0, 1>;
|
||||
def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 1>;
|
||||
def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>;
|
||||
def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 1>;
|
||||
def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 1>;
|
||||
def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 1>;
|
||||
def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>;
|
||||
def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1>;
|
||||
def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1>;
|
||||
def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 1>;
|
||||
def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 1>;
|
||||
def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 1>;
|
||||
def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 1>;
|
||||
def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 1>;
|
||||
def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 1>;
|
||||
def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 1>;
|
||||
def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 1>;
|
||||
def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 1>;
|
||||
def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 1>;
|
||||
def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 1>;
|
||||
def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 1>;
|
||||
def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
|
||||
def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
|
||||
def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 1>;
|
||||
def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 1>;
|
||||
def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
|
||||
def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 1>;
|
||||
def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 1>;
|
||||
def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 1>;
|
||||
def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 1>;
|
||||
def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 1>;
|
||||
def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 1>;
|
||||
def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 1>;
|
||||
def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 1>;
|
||||
def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 1>;
|
||||
def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 1>;
|
||||
def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 1>;
|
||||
def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 1>;
|
||||
def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 1>;
|
||||
|
||||
// Defines the nine VOP3PWMMA_Profile records used by the F8F6F4 WMMA family,
// one per (first source, second source) format pair. The record suffix is
// _<fmtA>_<fmtB>_w32. In the type list an f8 source is v16i32, an f6 source is
// v12i32 and an f4 source is v8i32; the first and last entries are always
// v8f32 (presumably the result and the accumulator -- confirm against the
// VOP3PWMMA_Profile parameter list). HasMatrixReuse is forwarded unchanged as
// the trailing template argument of every profile.
multiclass WMMA_F8F6F4_Profiles<bit HasMatrixReuse> {
|
||||
def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
|
||||
def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
|
||||
def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
|
||||
def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
|
||||
def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
|
||||
def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
|
||||
def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
|
||||
def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
|
||||
def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
|
||||
}
|
||||
|
||||
defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0>;
|
||||
|
||||
// Instantiates WMMAInstGFX12 once for each of the nine source-format pairs.
// OpName is the common mnemonic prefix; each instance is given the op name
// OpName # "_" # I # "_w32". Profile is the *name prefix* of the profile
// records (a string, not a record): the matching VOP3PWMMA_Profile is looked
// up by name with !cast, so a record called Profile # "_" # I # "_w32" must
// already exist for every I in the list.
multiclass WMMAInst_SrcFormats_mc<string OpName, string Profile> {
|
||||
foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
|
||||
defm _#I#_w32 : WMMAInstGFX12<OpName # "_" # I # "_w32", !cast<VOP3PWMMA_Profile>(Profile # "_" # I # "_w32"), "_w32">;
|
||||
}
|
||||
}
|
||||
|
||||
let WaveSizePredicate = isWave32 in {
|
||||
let SubtargetPredicate = isGFX125xOnly in {
|
||||
@ -1697,6 +1722,8 @@ defm V_SWMMAC_I32_16X16X128_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x12
|
||||
defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_f16", F32_F16X64_SWMMAC_w32, "_w32">;
|
||||
defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">;
|
||||
|
||||
defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">;
|
||||
|
||||
} // End is_wmma_xdl = 1.
|
||||
|
||||
} // End SubtargetPredicate = isGFX125xOnly
|
||||
@ -1854,6 +1881,10 @@ let SubtargetPredicate = isGFX125xOnly in {
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>;
|
||||
defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>;
|
||||
|
||||
// Selection patterns for the F8F6F4 WMMA family: every source-format pair maps
// the same intrinsic (int_amdgcn_wmma_f32_16x16x128_f8f6f4) onto its own
// pseudo name prefix, using the per-format profile looked up by name.
foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>;
|
||||
}
|
||||
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
|
||||
def : SWMMACPat<V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x64_bf16, BF16_BF16X64_SWMMAC_w32>;
|
||||
def : SWMMACPat<V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
|
||||
@ -1912,17 +1943,22 @@ multiclass VOP3P_Real_Base<GFXGen Gen, bits<8> op, string backing_ps_name = NAME
|
||||
|
||||
class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
|
||||
: VOP3Pe_gfx11_gfx12<op, P>{
|
||||
|
||||
// opsel
|
||||
let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0,
|
||||
let Inst{11} = !cond(WMMAP.HasMatrixFMT : matrix_a_fmt{0},
|
||||
!eq(WMMAP.IndexType, 0) : 0,
|
||||
!eq(WMMAP.IndexType, 8) : index_key_8bit{0},
|
||||
!eq(WMMAP.IndexType, 16) : index_key_16bit{0},
|
||||
!eq(WMMAP.IndexType, 32) : index_key_32bit{0});
|
||||
let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0);
|
||||
let Inst{13} = !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0);
|
||||
let Inst{12} = !if(WMMAP.HasMatrixFMT, matrix_a_fmt{1},
|
||||
!if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0));
|
||||
let Inst{13} = !if (WMMAP.HasMatrixFMT, matrix_a_fmt{2},
|
||||
!if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0));
|
||||
// opsel_hi
|
||||
let Inst{59} = 1;
|
||||
let Inst{60} = 1;
|
||||
let Inst{14} = !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1);
|
||||
let Inst{59} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{0}, 1);
|
||||
let Inst{60} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{1}, 1);
|
||||
let Inst{14} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{2},
|
||||
!if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1));
|
||||
// neg_lo
|
||||
let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
|
||||
let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
|
||||
@ -1961,6 +1997,24 @@ multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
|
||||
}
|
||||
}
|
||||
|
||||
// Real gfx1250 definition for one source-format variant of an F8F6F4 WMMA
// instruction.
//   op    - opcode passed through to VOP3P_Real_WMMA_gfx1250.
//   WMMAP - profile of this particular format variant.
// The pseudo is found as NAME # "_twoaddr". The per-format suffix (same length
// as "_f8_f8_w32") is stripped from the pseudo mnemonic so every variant is
// printed/parsed with the common mnemonic, and from the record name so every
// variant is tied, via MFMA_F8F6F4_WithSizeTable_Helper, to the f8_f8 real
// record.
multiclass VOP3P_Real_WMMA_F8F6F4_gfx1250<bits<8> op, VOP3PWMMA_Profile WMMAP> {
|
||||
defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
|
||||
defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
|
||||
// Measure NAME itself when trimming NAME. Using the mnemonic length here only
// worked while the record name and the mnemonic had identical lengths.
defvar psName = !substr(NAME, 0, !sub(!size(NAME), !size("_f8_f8_w32")));
|
||||
let AsmString = asmName # PS.AsmOperands in
|
||||
defm NAME : VOP3P_Real_WMMA_gfx1250<op, WMMAP>,
|
||||
MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_twoaddr_gfx1250">;
|
||||
}
|
||||
|
||||
// Emits the real gfx1250 records for all nine source-format variants of one
// F8F6F4 WMMA opcode. WMMAP is the name prefix of the profile records (looked
// up with !cast). All variants share the same opcode `op`; only the f8_f8
// variant is left visible to the disassembler, the other eight are marked
// isAsmParserOnly so that decoding is not ambiguous.
multiclass VOP3P_Real_WMMA_gfx1250_SrcFormats<bits<8> op, string WMMAP> {
|
||||
defm _f8_f8_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>;
|
||||
foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
|
||||
let isAsmParserOnly = true in { // Disable ambiguous disassembly.
|
||||
defm _#I#_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
|
||||
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
|
||||
@ -2035,6 +2089,8 @@ defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8B
|
||||
defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>;
|
||||
defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>;
|
||||
|
||||
defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x033, "F32_16X16X128_F8F6F4">;
|
||||
|
||||
defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>;
|
||||
|
@ -453,6 +453,8 @@ class VOP3Pe_Base {
|
||||
bits<2> index_key_8bit;
|
||||
bits<1> index_key_16bit;
|
||||
bits<1> index_key_32bit;
|
||||
bits<3> matrix_a_fmt;
|
||||
bits<3> matrix_b_fmt;
|
||||
bits<1> matrix_a_reuse;
|
||||
bits<1> matrix_b_reuse;
|
||||
}
|
||||
|
@ -302,6 +302,14 @@ define amdgpu_kernel void @wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8
|
||||
ret void
|
||||
}
|
||||
|
||||
; Uniformity test: the result of the f8f6f4 WMMA intrinsic call (both format
; immediates 0) is expected to be reported as DIVERGENT.
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
define amdgpu_ps void @wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %tmp0, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 false, <16 x half> %A, i1 false, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
|
||||
define amdgpu_ps void @swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
|
||||
@ -836,6 +844,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>,
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
|
||||
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1)
|
||||
|
@ -304,6 +304,556 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
; Format immediates A=1, B=2 with two <16 x i32> sources: both check prefixes
; expect matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Format immediates A=0, B=1: only matrix_b_fmt:MATRIX_FMT_BF8 is expected in
; the printed instruction (no matrix_a_fmt modifier appears for immediate 0).
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Format immediates A=0, B=2 with a <12 x i32> B source: only
; matrix_b_fmt:MATRIX_FMT_FP6 is expected and B occupies 12 registers.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Format immediates A=0, B=3 with a <12 x i32> B source: only
; matrix_b_fmt:MATRIX_FMT_BF6 is expected.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Format immediates A=0, B=4 with an <8 x i32> B source: only
; matrix_b_fmt:MATRIX_FMT_FP4 is expected and B occupies 8 registers.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Format immediates A=1, B=0: only matrix_a_fmt:MATRIX_FMT_BF8 is expected in
; the printed instruction (no matrix_b_fmt modifier appears for immediate 0).
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Format immediates A=1, B=1: both check prefixes expect
; matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Format immediates A=1, B=2 with a <12 x i32> B source: both check prefixes
; expect matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Format immediates A=1, B=3 with a <12 x i32> B source: both check prefixes
; expect matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Format immediates A=1, B=4 with an <8 x i32> B source: both check prefixes
; expect matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 1, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Format immediates A=2, B=0 with a <12 x i32> A source: only
; matrix_a_fmt:MATRIX_FMT_FP6 is expected and A occupies 12 registers.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Format immediates A=2, B=1 with a <12 x i32> A source: both check prefixes
; expect matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with both A and B passed as <12 x i32> under format selector 2.
; The expected assembly prints MATRIX_FMT_FP6 for both matrix_a_fmt and
; matrix_b_fmt and uses 12-register ranges for both inputs. The result is
; stored to %out as two 128-bit stores; the two check prefixes differ only in
; the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with A passed as <12 x i32> under format selector 2 (printed as
; MATRIX_FMT_FP6) and B passed as <12 x i32> under format selector 3 (printed
; as MATRIX_FMT_BF6), so that the BF6 encoding of the B operand named by this
; test is what actually gets exercised. The result is stored to %out as two
; 128-bit stores; the two check prefixes differ only in the order of those
; stores.
; NOTE(review): the expected lines below were adjusted by hand for the BF6
; selector; re-run the check-line generator to confirm them.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with A passed as <12 x i32> under format selector 2 and B passed
; as <8 x i32> under format selector 4. The expected assembly prints these as
; matrix_a_fmt:MATRIX_FMT_FP6 and matrix_b_fmt:MATRIX_FMT_FP4, with an
; 8-register range for B. The result is stored to %out as two 128-bit stores;
; the two check prefixes differ only in the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
|
||||
; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 2, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with A passed as <12 x i32> under format selector 3 and B passed
; as <16 x i32> under format selector 0. The expected assembly prints
; matrix_a_fmt:MATRIX_FMT_BF6 and no matrix_b_fmt modifier for selector 0.
; The result is stored to %out as two 128-bit stores; the two check prefixes
; differ only in the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with A passed as <12 x i32> under format selector 3 and B passed
; as <16 x i32> under format selector 1. The expected assembly prints these as
; matrix_a_fmt:MATRIX_FMT_BF6 and matrix_b_fmt:MATRIX_FMT_BF8. The result is
; stored to %out as two 128-bit stores; the two check prefixes differ only in
; the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
|
||||
; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with A passed as <12 x i32> under format selector 3 and B passed
; as <12 x i32> under format selector 2. The expected assembly prints these as
; matrix_a_fmt:MATRIX_FMT_BF6 and matrix_b_fmt:MATRIX_FMT_FP6. The result is
; stored to %out as two 128-bit stores; the two check prefixes differ only in
; the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with both A and B passed as <12 x i32> under format selector 3,
; which the expected assembly prints as MATRIX_FMT_BF6 for both matrix_a_fmt
; and matrix_b_fmt, so that the BF6 encoding of the B operand named by this
; test is what actually gets exercised. The result is stored to %out as two
; 128-bit stores; the two check prefixes differ only in the order of those
; stores.
; NOTE(review): the expected lines below were adjusted by hand for the BF6
; selector; re-run the check-line generator to confirm them.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with A passed as <12 x i32> under format selector 3 and B passed
; as <8 x i32> under format selector 4. The expected assembly prints these as
; matrix_a_fmt:MATRIX_FMT_BF6 and matrix_b_fmt:MATRIX_FMT_FP4, with an
; 8-register range for B. The result is stored to %out as two 128-bit stores;
; the two check prefixes differ only in the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
|
||||
; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with A passed as <8 x i32> under format selector 4 and B passed
; as <16 x i32> under format selector 0. The expected assembly prints
; matrix_a_fmt:MATRIX_FMT_FP4 with an 8-register range for A, and no
; matrix_b_fmt modifier for selector 0. The result is stored to %out as two
; 128-bit stores; the two check prefixes differ only in the order of those
; stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with A passed as <8 x i32> under format selector 4 and B passed
; as <16 x i32> under format selector 1. The expected assembly prints these as
; matrix_a_fmt:MATRIX_FMT_FP4 and matrix_b_fmt:MATRIX_FMT_BF8. The result is
; stored to %out as two 128-bit stores; the two check prefixes differ only in
; the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with A passed as <8 x i32> under format selector 4 and B passed
; as <12 x i32> under format selector 2. The expected assembly prints these as
; matrix_a_fmt:MATRIX_FMT_FP4 and matrix_b_fmt:MATRIX_FMT_FP6. The result is
; stored to %out as two 128-bit stores; the two check prefixes differ only in
; the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
|
||||
; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with A passed as <8 x i32> under format selector 4 (printed as
; MATRIX_FMT_FP4) and B passed as <12 x i32> under format selector 3 (printed
; as MATRIX_FMT_BF6), so that the BF6 encoding of the B operand named by this
; test is what actually gets exercised. The result is stored to %out as two
; 128-bit stores; the two check prefixes differ only in the order of those
; stores.
; NOTE(review): the expected lines below were adjusted by hand for the BF6
; selector; re-run the check-line generator to confirm them.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF6
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF6
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
|
||||
; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA with both A and B passed as <8 x i32> under format selector 4.
; The expected assembly prints MATRIX_FMT_FP4 for both matrix_a_fmt and
; matrix_b_fmt and uses 8-register ranges for both inputs. The result is
; stored to %out as two 128-bit stores; the two check prefixes differ only in
; the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||
; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v8i32(i32 4, <8 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
@ -815,6 +1365,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>,
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
|
||||
@ -824,6 +1375,7 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
|
||||
declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
|
||||
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
|
||||
declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
|
||||
|
@ -1342,6 +1342,110 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA whose C operand is a constant <8 x float> with every element
; equal to 1.0. The expected assembly passes C as the immediate operand 1.0
; rather than a register range, and writes the result to v[34:41]. The result
; is stored to %out as two 128-bit stores; the two check prefixes differ only
; in the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA whose C operand is a constant <8 x float> that is not uniform:
; element 1 is 2.0 and every other element is 1.0. The expected assembly
; first builds the vector in v[34:41] and then passes that register range as
; C, instead of using an immediate operand. The two check prefixes build the
; vector differently (VGPR moves for one, SGPR moves copied to VGPRs for the
; other) and also differ in store order.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0
|
||||
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34
|
||||
; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
|
||||
; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: s_mov_b32 s0, 1.0
|
||||
; GISEL-NEXT: s_mov_b32 s1, 2.0
|
||||
; GISEL-NEXT: s_mov_b32 s6, s0
|
||||
; GISEL-NEXT: s_mov_b32 s7, s0
|
||||
; GISEL-NEXT: s_mov_b32 s2, s0
|
||||
; GISEL-NEXT: s_mov_b32 s3, s0
|
||||
; GISEL-NEXT: s_mov_b32 s4, s0
|
||||
; GISEL-NEXT: s_mov_b32 s5, s0
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
|
||||
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA whose C operand is a constant <8 x float> with every element
; equal to 3.0. The expected assembly loads the bit pattern 0x40400000 into
; one register, copies it across v[34:41], and passes that register range as
; C, instead of using an immediate operand. The two check prefixes build the
; vector differently (VGPR moves for one, SGPR moves copied to VGPRs for the
; other) and also differ in store order.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
|
||||
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
|
||||
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
|
||||
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GISEL-NEXT: s_mov_b32 s6, s0
|
||||
; GISEL-NEXT: s_mov_b32 s7, s0
|
||||
; GISEL-NEXT: s_mov_b32 s1, s0
|
||||
; GISEL-NEXT: s_mov_b32 s2, s0
|
||||
; GISEL-NEXT: s_mov_b32 s3, s0
|
||||
; GISEL-NEXT: s_mov_b32 s4, s0
|
||||
; GISEL-NEXT: s_mov_b32 s5, s0
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
|
||||
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
|
||||
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
@ -2227,6 +2331,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>,
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
|
||||
|
@ -1126,6 +1126,72 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA called with the i16 modifier argument (the one preceding %C) set
; to 1. The expected assembly prints this as neg_lo:[0,0,1] on the instruction
; and carries no neg_hi modifier. The result is stored to %out as two 128-bit
; stores; the two check prefixes differ only in the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 1, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA called with the i16 modifier argument (the one preceding %C) set
; to 3. The expected assembly prints this as both neg_lo:[0,0,1] and
; neg_hi:[0,0,1] on the instruction. The result is stored to %out as two
; 128-bit stores; the two check prefixes differ only in the order of those
; stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 3, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; f8f6f4 WMMA called with the i16 modifier argument (the one preceding %C) set
; to 4. The expected assembly carries no neg_lo or neg_hi modifier, i.e. this
; value leaves the printed instruction the same as with a zero modifier. The
; result is stored to %out as two 128-bit stores; the two check prefixes
; differ only in the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39]
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39]
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
|
||||
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 4, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
@ -1967,6 +2033,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>,
|
||||
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
|
||||
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
|
||||
|
@ -923,6 +923,71 @@ v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
// Assembler round-trip cases for v_wmma_f32_16x16x128_f8f6f4. Each case gives
// the expected re-printed instruction and encoding for the supporting target,
// plus the diagnostics expected when the wave size or the GPU does not
// support the instruction.

// No format modifiers: A and B are both 16-register ranges.
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47]
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
// matrix_a_fmt BF8: A stays a 16-register range; second encoding byte is 0x08.
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
// matrix_a_fmt FP6: A is a 12-register range; second encoding byte is 0x10.
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
// matrix_a_fmt BF6: A is a 12-register range; second encoding byte is 0x18.
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
// matrix_a_fmt FP4: A is an 8-register range; second encoding byte is 0x20.
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
// matrix_b_fmt BF8: B stays a 16-register range; last encoding byte is 0x0c.
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
// matrix_b_fmt FP6: B is a 12-register range; last encoding byte is 0x14.
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1]
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1]
|
||||
// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19]
|
||||
// GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x1c]
|
||||
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
|
||||
|
@ -363,6 +363,82 @@ v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:2
|
||||
v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,0,1]
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[1,0,0]
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,1,0]
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[1,0,0]
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,1,0]
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] clamp
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:-1
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid matrix_b_fmt value
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:xxx
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid matrix_b_fmt value
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47]
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8
|
||||
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47]
|
||||
// GFX1250-ERR-NEXT: {{^}} ^
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP8
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8
|
||||
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP8
|
||||
// GFX1250-ERR-NEXT: {{^}} ^
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF8
|
||||
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8
|
||||
// GFX1250-ERR-NEXT: {{^}} ^
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP6
|
||||
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6
|
||||
// GFX1250-ERR-NEXT: {{^}} ^
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF6
|
||||
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6
|
||||
// GFX1250-ERR-NEXT: {{^}} ^
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP4
|
||||
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4
|
||||
// GFX1250-ERR-NEXT: {{^}} ^
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP8
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8
|
||||
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP8
|
||||
// GFX1250-ERR-NEXT: {{^}} ^
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF8
|
||||
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8
|
||||
// GFX1250-ERR-NEXT: {{^}} ^
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP6
|
||||
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6
|
||||
// GFX1250-ERR-NEXT: {{^}} ^
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF6
|
||||
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6
|
||||
// GFX1250-ERR-NEXT: {{^}} ^
|
||||
|
||||
v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP4
|
||||
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
|
||||
// GFX1250-ERR-NEXT: {{^}} ^
|
||||
|
||||
v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0]
|
||||
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
|
||||
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0]
|
||||
|
@ -364,6 +364,45 @@
|
||||
0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c
|
||||
# GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c]
|
||||
|
||||
0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
|
||||
0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
|
||||
0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
|
||||
0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
|
||||
0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14]
|
||||
|
||||
0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c]
|
||||
|
||||
0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14]
|
||||
|
||||
0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03]
|
||||
|
||||
0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
|
||||
0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
|
||||
0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c]
|
||||
|
||||
0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04]
|
||||
|
||||
0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84
|
||||
# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84]
|
||||
|
||||
0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b
|
||||
# GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b]
|
||||
|
||||
|
158
llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll
Normal file
158
llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll
Normal file
@ -0,0 +1,158 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s
|
||||
|
||||
; ------------------------------------------------------------------------------------
|
||||
; Incorrect signature for format cases (IR vector too large) wmma.f32.16x16x128.f8f6f4
|
||||
; ------------------------------------------------------------------------------------
|
||||
|
||||
; Input: A and B are both <16 x i32>. The call passes format selector 2 for A
; (fp6 per the test name) and 0 for B (fp8). The expectations below require A to
; be narrowed to its first 12 elements with a shufflevector and the call to use
; the v12i32/v16i32 overload; B stays <16 x i32>.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(
|
||||
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[BB:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
|
||||
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 2, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Input: A and B are both <16 x i32>. The call passes format selector 1 for A
; (bf8 per the test name) and 2 for B (fp6). The expectations below require B to
; be narrowed to its first 12 elements with a shufflevector and the call to use
; the v16i32/v12i32 overload; A stays <16 x i32>.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(
|
||||
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[BB:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
|
||||
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Input: A and B are both <16 x i32>. The call passes format selector 3 for A
; (bf6 per the test name) and 1 for B (bf8). The expectations below require A to
; be narrowed to its first 12 elements with a shufflevector and the call to use
; the v12i32/v16i32 overload; B stays <16 x i32>.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(
|
||||
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[BB:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> [[TMP0]], i32 1, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
|
||||
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Input: A and B are both <16 x i32>. The call passes format selector 1 for A
; (bf8 per the test name) and 3 for B (bf6). The expectations below require B to
; be narrowed to its first 12 elements with a shufflevector and the call to use
; the v16i32/v12i32 overload; A stays <16 x i32>.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(
|
||||
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[BB:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 3, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
|
||||
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 3, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Input: A and B are both <16 x i32>. The call passes format selector 4 for A
; (fp4 per the test name) and 0 for B (fp8). The expectations below require A to
; be narrowed to its first 8 elements with a shufflevector and the call to use
; the v8i32/v16i32 overload; B stays <16 x i32>.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(
|
||||
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[BB:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
|
||||
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Input: A and B are both <16 x i32>. The call passes format selector 0 for A
; (fp8 per the test name) and 4 for B (fp4). The expectations below require B to
; be narrowed to its first 8 elements with a shufflevector and the call to use
; the v16i32/v8i32 overload; A stays <16 x i32>.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(
|
||||
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[BB:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
|
||||
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Input: A is <12 x i32> with format selector 4 (fp4 per the test name); B is
; <16 x i32> with selector 0 (fp8). The expectations below require A to be
; narrowed from 12 to its first 8 elements and the call to use the
; v8i32/v16i32 overload; B stays <16 x i32>.
; NOTE(review): the callee name in the input call still carries the
; .v16i32.v16i32 suffix although operand 1 is <12 x i32>; presumably the IR
; parser re-derives the overload suffix from the call signature - confirm.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(
|
||||
; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[BB:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
|
||||
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Input: A is <16 x i32> with format selector 0 (fp8 per the test name); B is
; <12 x i32> with selector 4 (fp4). The expectations below require B to be
; narrowed from 12 to its first 8 elements and the call to use the
; v16i32/v8i32 overload; A stays <16 x i32>.
; NOTE(review): the callee name in the input call still carries the
; .v16i32.v16i32 suffix although operand 3 is <12 x i32>; presumably the IR
; parser re-derives the overload suffix from the call signature - confirm.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(
|
||||
; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[BB:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
|
||||
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Input: A is <12 x i32> with format selector 4 (fp4 per the test name); B is
; <16 x i32> with selector 2 (fp6). Both operands are oversized here: the
; expectations below require A to be narrowed to its first 8 elements, B to its
; first 12 elements, and the call to use the v8i32/v12i32 overload.
; NOTE(review): the callee name in the input call still carries the
; .v16i32.v16i32 suffix although operand 1 is <12 x i32>; presumably the IR
; parser re-derives the overload suffix from the call signature - confirm.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(
|
||||
; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[BB:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> [[TMP0]], i32 2, <12 x i32> [[TMP1]], i16 0, <8 x float> [[C]])
|
||||
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Input: A is <16 x i32> with format selector 3 (bf6 per the test name); B is
; <12 x i32> with selector 4 (fp4). Both operands are oversized here: the
; expectations below require A to be narrowed to its first 12 elements, B to
; its first 8 elements, and the call to use the v12i32/v8i32 overload.
; NOTE(review): the callee name in the input call still carries the
; .v16i32.v16i32 suffix although operand 3 is <12 x i32>; presumably the IR
; parser re-derives the overload suffix from the call signature - confirm.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(
|
||||
; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
|
||||
; CHECK-NEXT: [[BB:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> [[TMP0]], i32 4, <8 x i32> [[TMP1]], i16 0, <8 x float> [[C]])
|
||||
; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
165
llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
Normal file
165
llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
Normal file
@ -0,0 +1,165 @@
|
||||
; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; Wrong mangled types
|
||||
; --------------------------------------------------------------------
|
||||
|
||||
; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: <16 x i64> %A
|
||||
; Negative case (the RUN line expects llvm-as to fail): A is passed as
; <16 x i64>, so its element type is i64 instead of i32. B is <16 x i32> and
; both format operands are 0 (fp8 per the test name). The diagnostic expected
; for operand 1 is listed directly above.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i64_fp8___v16i32_fp8(<16 x i64> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: <16 x i64> %B
|
||||
; Negative case (the RUN line expects llvm-as to fail): B is passed as
; <16 x i64>, so its element type is i64 instead of i32. A is <16 x i32> and
; both format operands are 0 (fp8 per the test name). The diagnostic expected
; for operand 3 is listed directly above.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i64_fp8(<16 x i32> %A, <16 x i64> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; Impossible vector types
|
||||
; --------------------------------------------------------------------
|
||||
|
||||
; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v9i32.v16i32(i32 0, <9 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: <9 x i32> %A
|
||||
; Negative case (the RUN line expects llvm-as to fail): A is <9 x i32>, a
; vector length that is none of the 8, 12 or 16 elements named in the expected
; diagnostic above. B is <16 x i32> and both format operands are 0 (fp8 per the
; test name).
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v9i32_fp8___v16i32_fp8(<9 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v9i32.v16i32(i32 0, <9 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v9i32(i32 0, <16 x i32> %A, i32 0, <9 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: <9 x i32> %B
|
||||
; Negative case (the RUN line expects llvm-as to fail): B is <9 x i32>, a
; vector length that is none of the 8, 12 or 16 elements named in the expected
; diagnostic above. A is <16 x i32> and both format operands are 0 (fp8 per the
; test name).
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v9i32_fp8(<16 x i32> %A, <9 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v9i32(i32 0, <16 x i32> %A, i32 0, <9 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; Out of bounds format
|
||||
; --------------------------------------------------------------------
|
||||
|
||||
; CHECK: invalid value for matrix format
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 9999, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: i32 9999
|
||||
; Negative case (the RUN line expects llvm-as to fail): the format operand that
; precedes A is 9999, which the expected diagnostic above reports as an invalid
; matrix format value. Both vectors are <16 x i32> and the format operand that
; precedes B is 0 (fp8 per the test name).
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_invalid0___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 9999, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: invalid value for matrix format
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 9999, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: i32 9999
|
||||
; Negative case (the RUN line expects llvm-as to fail): the format operand that
; precedes B is 9999, which the expected diagnostic above reports as an invalid
; matrix format value. Both vectors are <16 x i32> and the format operand that
; precedes A is 0 (fp8 per the test name).
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_invalid1(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 9999, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; Incorrect signature for format cases (IR vector too small)
|
||||
; --------------------------------------------------------------------
|
||||
|
||||
; CHECK: invalid vector type for format
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: <8 x i32> %A
|
||||
; CHECK-NEXT: i32 0
|
||||
; Negative case (the RUN line expects llvm-as to fail): A is only <8 x i32>
; while its format operand is 0 (fp8 per the test name). The expected
; diagnostic above rejects this vector type for that format; presumably fp8
; needs the 16-element form used for B here.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v8i32_fp8___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: invalid vector type for format
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 0, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: <12 x i32> %A
|
||||
; CHECK-NEXT: i32 0
|
||||
; Negative case (the RUN line expects llvm-as to fail): A is only <12 x i32>
; while its format operand is 0 (fp8 per the test name). The expected
; diagnostic above rejects this vector type for that format; presumably fp8
; needs the 16-element form used for B here.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp8___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 0, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: invalid vector type for format
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 1, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: <8 x i32> %A
|
||||
; CHECK-NEXT: i32 1
|
||||
; Negative case (the RUN line expects llvm-as to fail): A is only <8 x i32>
; while its format operand is 1 (bf8 per the test name). The expected
; diagnostic above rejects this vector type for that format. B is <16 x i32>
; with format operand 0 (fp8 per the test name).
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v8i32_bf8___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 1, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: invalid vector type for format
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 1, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: <12 x i32> %A
|
||||
; CHECK-NEXT: i32 1
|
||||
; Negative case (the RUN line expects llvm-as to fail): A is only <12 x i32>
; while its format operand is 1 (bf8 per the test name). The expected
; diagnostic above rejects this vector type for that format. B is <16 x i32>
; with format operand 0 (fp8 per the test name).
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_bf8___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 1, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: invalid vector type for format
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 0, <8 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: <8 x i32> %B
|
||||
; CHECK-NEXT: i32 0
|
||||
; Negative case (the RUN line expects llvm-as to fail): B is only <8 x i32>
; while its format operand is 0 (fp8 per the test name). The expected
; diagnostic above rejects this vector type for that format; presumably fp8
; needs the 16-element form used for A here.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_fp8(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 0, <8 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: invalid vector type for format
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 0, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: <12 x i32> %B
|
||||
; CHECK-NEXT: i32 0
|
||||
; Negative case (the RUN line expects llvm-as to fail): B is only <12 x i32>
; while its format operand is 0 (fp8 per the test name). The expected
; diagnostic above rejects this vector type for that format; presumably fp8
; needs the 16-element form used for A here.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp8(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 0, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: invalid vector type for format
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 1, <8 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: <8 x i32> %B
|
||||
; CHECK-NEXT: i32 1
|
||||
; Negative case (the RUN line expects llvm-as to fail): B is only <8 x i32>
; while its format operand is 1 (bf8 per the test name). The expected
; diagnostic above rejects this vector type for that format. A is <16 x i32>
; with format operand 0 (fp8 per the test name).
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_bf8(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 1, <8 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: invalid vector type for format
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 1, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
; CHECK-NEXT: <12 x i32> %B
|
||||
; CHECK-NEXT: i32 1
|
||||
; Negative case (the RUN line expects llvm-as to fail): B is only <12 x i32>
; while its format operand is 1 (bf8 per the test name). The expected
; diagnostic above rejects this vector type for that format. A is <16 x i32>
; with format operand 0 (fp8 per the test name).
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_bf8(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 1, <12 x i32> %B, i16 0, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user