[AMDGPU] Add gfx1250 wmma_scale[16]_f32_32x16x128_f4 instructions (#152194)

This commit is contained in:
Stanislav Mekhanoshin 2025-08-05 15:15:21 -07:00 committed by GitHub
parent 5a076e3b4d
commit 34aed0ed56
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 1027 additions and 0 deletions

View File

@ -805,6 +805,8 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, "V8fIiV16iIiV
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_f16, "V8fIbV16hIbV16hIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x32_f16, "V8hIbV16hIbV16hIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_32x16x128_f4, "V16fV16iV8iIsV16f", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_scale_f32_32x16x128_f4, "V16fV16iV8iIsV16fIiIiiIiIiiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4, "V16fV16iV8iIsV16fIiIiLiIiIiLiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x64_bf16, "V8yIbV16yIbV32yV8yiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")

View File

@ -894,6 +894,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
@ -1172,6 +1174,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ArgsForMatchingMatrixTypes = {3, 0, 1};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
ArgsForMatchingMatrixTypes = {3, 0, 1};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
ArgsForMatchingMatrixTypes = {3, 0, 1};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4;
break;
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;

View File

@ -314,6 +314,28 @@ void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b,
*out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, 0, c);
}
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale_f32_32x16x128_f4(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]], i32 1, i32 2, i32 [[SCALE_SRC0:%.*]], i32 2, i32 1, i32 [[SCALE_SRC1:%.*]], i1 false, i1 true)
// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA4]]
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_wmma_scale_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int scale_src0, int scale_src1)
{
*out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 2, scale_src0, 2, 1, scale_src1, 0, 1);
}
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale16_f32_32x16x128_f4(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]], i32 1, i32 2, i64 [[SCALE_SRC0:%.*]], i32 2, i32 1, i64 [[SCALE_SRC1:%.*]], i1 false, i1 true)
// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA4]]
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_wmma_scale16_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, long scale_src0, long scale_src1)
{
*out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 2, scale_src0, 2, 1, scale_src1, 0, 1);
}
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_bf16(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)

View File

@ -230,6 +230,32 @@ void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b,
*out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, mod, c); // expected-error {{'__builtin_amdgcn_wmma_f32_32x16x128_f4' must be a constant integer}}
}
void test_amdgcn_wmma_scale_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int mod, int scale_src0, int scale_src1, bool reuse)
{
*out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, mod, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, mod, 0, scale_src0, 2, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, mod, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, reuse, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, reuse); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, mod, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, mod, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, mod, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, mod); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
}
void test_amdgcn_wmma_scale16_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int mod, long scale_src0, long scale_src1, bool reuse)
{
*out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, mod, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, mod, 0, scale_src0, 2, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, mod, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, reuse, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, reuse); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, mod, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, mod, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, mod, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
*out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, mod); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
}
void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index, int mod)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}}

View File

@ -3956,6 +3956,28 @@ class AMDGPUWmmaScaleIntrinsicModsC<LLVMType scale_ty> :
IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
class AMDGPUWmmaScaleF4IntrinsicModsC<LLVMType scale_ty> :
Intrinsic<
[llvm_anyfloat_ty], // %D
[
llvm_anyint_ty, // %A
llvm_anyint_ty, // %B
llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs)
LLVMMatchType<0>, // %C
llvm_i32_ty, // matrix_a_scale
llvm_i32_ty, // matrix_a_scale_fmt
scale_ty, // matrix a scale exponential
llvm_i32_ty, // matrix_b_scale
llvm_i32_ty, // matrix_b_scale_fmt
scale_ty, // matrix b scale exponential
llvm_i1_ty, // matrix_a_reuse
llvm_i1_ty, // matrix_b_reuse
],
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<7>>,
ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<10>>, ImmArg<ArgIndex<11>>,
IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
@ -3984,6 +4006,8 @@ def int_amdgcn_wmma_f32_16x16x128_f8f6f4 : AMDGPUWmmaIntrinsicModsC_MatrixFMT;
def int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC<llvm_i32_ty>;
def int_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC<llvm_i64_ty>;
def int_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUWmmaIntrinsicF4ModsC<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_scale_f32_32x16x128_f4 : AMDGPUWmmaScaleF4IntrinsicModsC<llvm_i32_ty>;
def int_amdgcn_wmma_scale16_f32_32x16x128_f4 : AMDGPUWmmaScaleF4IntrinsicModsC<llvm_i64_ty>;
}
class AMDGPUSWmmacIntrinsicABIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :

View File

@ -4801,6 +4801,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4:
case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4:
case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:

View File

@ -1763,6 +1763,8 @@ def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16
def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>;
def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>;
def F32_32X16X128_F4_SCALE_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 0, 1>;
def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 1, 1>;
def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
@ -1852,6 +1854,9 @@ defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64
defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">;
defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE">;
defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale16_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE16">;
defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16x128_f4", F32_32X16X128_F4_SCALE_w32, "_w32">;
defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">;
} // End is_wmma_xdl = 1.
defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32>>;
@ -2010,6 +2015,8 @@ let SubtargetPredicate = isGFX125xOnly in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>;
defm : WMMAPat<"V_WMMA_SCALE_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale_f32_32x16x128_f4, F32_32X16X128_F4_SCALE_w32>;
defm : WMMAPat<"V_WMMA_SCALE16_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale16_f32_32x16x128_f4, F32_32X16X128_F4_SCALE16_w32>;
foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>;
@ -2191,6 +2198,15 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO
let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0);
}
multiclass VOP3PX2_Real_ScaledWMMA_F4<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32,
DecoderNamespace = "GFX1250" in {
def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, PS.Mnemonic>,
VOP3PX2e <op, LdScaleOp, WMMAP>;
}
}
multiclass VOP3PX2_Real_ScaledWMMA<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
@ -2292,6 +2308,9 @@ defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x0
defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">;
defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">;
defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x35, F32_32X16X128_F4_SCALE_w32>;
defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>;
defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>;

View File

@ -2236,6 +2236,170 @@ bb:
ret void
}
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[42:43], v[32:35], off offset:32
; GFX1250-NEXT: global_store_b128 v[42:43], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[42:43], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[42:43], v[24:27], off
; GISEL-NEXT: global_store_b128 v[42:43], v[28:31], off offset:16
; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off offset:32
; GISEL-NEXT: global_store_b128 v[42:43], v[36:39], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 false, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_ss:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_ss:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off
; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 2, i32 1, i32 %scale_src0, i32 1, i32 2, i32 %scale_src1, i1 true, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_movk_i32 s1, 0x64
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_mov_b32_e32 v42, 0x64
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off
; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 3, i32 2, i32 %scale_src0, i32 0, i32 1, i32 100, i1 false, i1 true)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v[40:41], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[44:45], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[44:45], v[32:35], off offset:32
; GFX1250-NEXT: global_store_b128 v[44:45], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[44:45], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v[40:41], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[44:45], v[24:27], off
; GISEL-NEXT: global_store_b128 v[44:45], v[28:31], off offset:16
; GISEL-NEXT: global_store_b128 v[44:45], v[32:35], off offset:32
; GISEL-NEXT: global_store_b128 v[44:45], v[36:39], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 1, i32 0, i64 %scale_src0, i32 1, i32 0, i64 %scale_src1, i1 false, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_ss:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_ss:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off
; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 2, i32 1, i64 %scale_src0, i32 1, i32 2, i64 %scale_src1, i1 true, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off
; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 3, i32 2, i64 %scale_src0, i32 0, i32 1, i64 100, i1 false, i1 true)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16:
; GFX1250: ; %bb.0: ; %bb
@ -2573,6 +2737,8 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, i32, i64, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)

View File

@ -2530,6 +2530,312 @@ bb:
ret void
}
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 true, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26
; GFX1250-NEXT: v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26
; GFX1250-NEXT: v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26
; GFX1250-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26
; GFX1250-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v26
; GFX1250-NEXT: v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26
; GFX1250-NEXT: v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s1, 2.0
; GISEL-NEXT: s_mov_b32 s14, s0
; GISEL-NEXT: s_mov_b32 s15, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s8, s0
; GISEL-NEXT: s_mov_b32 s9, s0
; GISEL-NEXT: s_mov_b32 s10, s0
; GISEL-NEXT: s_mov_b32 s11, s0
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i32 1, i32 1, i32 0, i32 2, i1 false, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
; GFX1250-NEXT: s_movk_i32 s0, 0x65
; GFX1250-NEXT: s_movk_i32 s1, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
; GFX1250-NEXT: v_mov_b32_e32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: v_mov_b32_e32 v42, 0x64
; GISEL-NEXT: s_mov_b32 s14, s0
; GISEL-NEXT: s_mov_b32 s15, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s8, s0
; GISEL-NEXT: s_mov_b32 s9, s0
; GISEL-NEXT: s_mov_b32 s10, s0
; GISEL-NEXT: s_mov_b32 s11, s0
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GISEL-NEXT: v_mov_b32_e32 v43, 0x65
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v42, v43 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i32 1, i32 0, i32 100, i32 1, i32 0, i32 101, i1 true, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i64 %scale_src0, i32 1, i32 0, i64 %scale_src1, i1 true, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26
; GFX1250-NEXT: v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26
; GFX1250-NEXT: v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26
; GFX1250-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26
; GFX1250-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v26
; GFX1250-NEXT: v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26
; GFX1250-NEXT: v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s1, 2.0
; GISEL-NEXT: s_mov_b32 s14, s0
; GISEL-NEXT: s_mov_b32 s15, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s8, s0
; GISEL-NEXT: s_mov_b32 s9, s0
; GISEL-NEXT: s_mov_b32 s10, s0
; GISEL-NEXT: s_mov_b32 s11, s0
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i64 1, i32 1, i32 0, i64 2, i1 false, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
; GFX1250-NEXT: s_mov_b64 s[0:1], 0x65
; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
; GFX1250-NEXT: v_mov_b32_e32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64
; GISEL-NEXT: s_mov_b32 s14, s0
; GISEL-NEXT: s_mov_b32 s15, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s8, s0
; GISEL-NEXT: s_mov_b32 s9, s0
; GISEL-NEXT: s_mov_b32 s10, s0
; GISEL-NEXT: s_mov_b32 s11, s0
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x65
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v[42:43], v[44:45] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i32 1, i32 0, i64 100, i32 1, i32 0, i64 101, i1 true, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
@ -2557,3 +2863,5 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, i32, i64, i1, i1)

View File

@ -1882,6 +1882,162 @@ bb:
ret void
}
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_negC:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_negC:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off
; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 1, <16 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_neg_absC:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_neg_absC:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off
; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 3, <16 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_ignoreC:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_ignoreC:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off
; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 4, <16 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_negC:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_negC:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off
; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 1, <16 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_neg_absC:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_neg_absC:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off
; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 3, <16 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_ignoreC:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_ignoreC:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off
; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 4, <16 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16_negA:
; GFX1250: ; %bb.0: ; %bb
@ -2177,6 +2333,8 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, i32, i64, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)

View File

@ -1737,3 +1737,173 @@ v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:
// GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW0
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW0
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1]
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW0
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW0
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU

View File

@ -449,6 +449,16 @@ v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MAT
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47], v1, v2
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8
// GFX1250-ERR-NEXT: {{^}}v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47], v1, v2
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP6
// GFX1250-ERR-NEXT: {{^}}v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0]
@ -468,3 +478,23 @@ v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,1,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,1,0]
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 neg_lo:[1,0,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
// GFX1250-ERR-NEXT: {{^}}v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 neg_lo:[1,0,0]
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_fmt:0
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// GFX1250-ERR-NEXT: {{^}}v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_fmt:0
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[0:1], v[2:3] neg_lo:[1,0,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
// GFX1250-ERR-NEXT: {{^}}v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[0:1], v[2:3] neg_lo:[1,0,0]
// GFX1250-ERR-NEXT: {{^}} ^
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[0:1], v[2:3] matrix_a_fmt:0
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// GFX1250-ERR-NEXT: {{^}}v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[0:1], v[2:3] matrix_a_fmt:0
// GFX1250-ERR-NEXT: {{^}} ^

View File

@ -999,3 +999,93 @@
0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c
# GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c]
0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]