From 34aed0ed5615583a8f1aaf9c036cc69fa88b3503 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 5 Aug 2025 15:15:21 -0700 Subject: [PATCH] [AMDGPU] Add gfx1250 wmma_scale[16]_f32_32x16x128_f4 instructions (#152194) --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 10 + .../builtins-amdgcn-gfx1250-wmma-w32.cl | 22 ++ ...ins-amdgcn-error-gfx1250-wmma-w32-param.cl | 26 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 24 ++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 + llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 19 ++ .../AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll | 166 ++++++++++ .../llvm.amdgcn.wmma.imm.gfx1250.w32.ll | 308 ++++++++++++++++++ .../llvm.amdgcn.wmma.imod.gfx1250.w32.ll | 158 +++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s | 170 ++++++++++ .../test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s | 30 ++ .../AMDGPU/gfx1250_dasm_wmma_w32.txt | 90 +++++ 13 files changed, 1027 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 5821a69b4de1..b16d4a22207a 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -805,6 +805,8 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, "V8fIiV16iIiV TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_f16, "V8fIbV16hIbV16hIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x32_f16, "V8hIbV16hIbV16hIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_32x16x128_f4, "V16fV16iV8iIsV16f", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_scale_f32_32x16x128_f4, "V16fV16iV8iIsV16fIiIiiIiIiiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4, "V16fV16iV8iIsV16fIiIiLiIiIiLiIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x64_bf16, "V8yIbV16yIbV32yV8yiIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index e450b1415a04..dad1f95ac710 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -894,6 +894,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4: case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4: case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: + case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4: + case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16: case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16: @@ -1172,6 +1174,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, ArgsForMatchingMatrixTypes = {3, 0, 1}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4; break; + case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4: + ArgsForMatchingMatrixTypes = {3, 0, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4: + ArgsForMatchingMatrixTypes = {3, 0, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4; + break; case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16: ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl index af8a457bec68..bdb1a7f0bb32 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl @@ -314,6 +314,28 @@ void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, *out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, 0, c); } +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale_f32_32x16x128_f4( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]], i32 1, i32 2, i32 [[SCALE_SRC0:%.*]], i32 2, i32 1, i32 [[SCALE_SRC1:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_scale_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int scale_src0, int scale_src1) +{ + *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 2, scale_src0, 2, 1, scale_src1, 0, 1); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale16_f32_32x16x128_f4( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]], i32 1, i32 2, i64 [[SCALE_SRC0:%.*]], i32 2, i32 1, i64 [[SCALE_SRC1:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_scale16_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, long scale_src0, long scale_src1) +{ + *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 2, scale_src0, 2, 1, scale_src1, 0, 1); +} + // CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_bf16( // CHECK-GFX1250-NEXT: entry: // CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl index 83fdc8c8c60b..49ef2e571740 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl @@ -230,6 +230,32 @@ void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, *out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, mod, c); // expected-error {{'__builtin_amdgcn_wmma_f32_32x16x128_f4' must be a constant integer}} } +void test_amdgcn_wmma_scale_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int mod, int scale_src0, int scale_src1, bool reuse) +{ + *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, mod, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, mod, 0, scale_src0, 2, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, mod, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, reuse, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, reuse); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, mod, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, mod, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, mod, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, mod); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}} +} + +void test_amdgcn_wmma_scale16_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int mod, long scale_src0, long scale_src1, bool reuse) +{ + *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, mod, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, mod, 0, scale_src0, 2, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, mod, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, reuse, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, reuse); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, mod, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, mod, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, mod, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, mod); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}} +} + void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index, int mod) { *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index bfadc6a58f7f..90cfd8cedd51 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3956,6 +3956,28 @@ class AMDGPUWmmaScaleIntrinsicModsC : IntrWillReturn, IntrNoCallback, IntrNoFree] >; +class AMDGPUWmmaScaleF4IntrinsicModsC : + Intrinsic< + [llvm_anyfloat_ty], // %D + [ + llvm_anyint_ty, // %A + llvm_anyint_ty, // %B + llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs) + LLVMMatchType<0>, // %C + llvm_i32_ty, // matrix_a_scale + llvm_i32_ty, // matrix_a_scale_fmt + scale_ty, // matrix a scale exponential + llvm_i32_ty, // matrix_b_scale + llvm_i32_ty, // matrix_b_scale_fmt + scale_ty, // matrix b scale exponential + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, ImmArg>, + ImmArg>, ImmArg>, ImmArg>, + IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + defset list AMDGPUWMMAIntrinsicsGFX1250 = { def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse; def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse; @@ -3984,6 +4006,8 @@ def int_amdgcn_wmma_f32_16x16x128_f8f6f4 : AMDGPUWmmaIntrinsicModsC_MatrixFMT; def int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC; def int_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC; def int_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUWmmaIntrinsicF4ModsC; +def int_amdgcn_wmma_scale_f32_32x16x128_f4 : AMDGPUWmmaScaleF4IntrinsicModsC; +def int_amdgcn_wmma_scale16_f32_32x16x128_f4 : AMDGPUWmmaScaleF4IntrinsicModsC; } class AMDGPUSWmmacIntrinsicABIdx : diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 74230a543ef1..868b1a21e3cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4801,6 +4801,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4: case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: case Intrinsic::amdgcn_wmma_f32_32x16x128_f4: + case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4: + case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4: case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 9264935ffad7..ce280d484da1 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1763,6 +1763,8 @@ def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16 def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>; def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>; def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>; +def F32_32X16X128_F4_SCALE_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 0, 1>; +def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 1, 1>; def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>; def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>; def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 0, 0, 1>; @@ -1852,6 +1854,9 @@ defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64 defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">; defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE">; defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale16_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE16">; + +defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16x128_f4", F32_32X16X128_F4_SCALE_w32, "_w32">; +defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE>; @@ -2010,6 +2015,8 @@ let SubtargetPredicate = isGFX125xOnly in { defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_WMMA_w32>; defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>; defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>; + defm : WMMAPat<"V_WMMA_SCALE_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale_f32_32x16x128_f4, F32_32X16X128_F4_SCALE_w32>; + defm : WMMAPat<"V_WMMA_SCALE16_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale16_f32_32x16x128_f4, F32_32X16X128_F4_SCALE16_w32>; foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast("F32_16X16X128_F8F6F4_" # I # "_w32")>; @@ -2191,6 +2198,15 @@ class VOP3PX2e op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0); } +multiclass VOP3PX2_Real_ScaledWMMA_F4 op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { + defvar PS = !cast(NAME # "_twoaddr"); + let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32, + DecoderNamespace = "GFX1250" in { + def _gfx1250 : VOP3P_Real_Gen, + VOP3PX2e ; + } +} + multiclass VOP3PX2_Real_ScaledWMMA op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { defvar PS = !cast(NAME # "_twoaddr"); defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); @@ -2292,6 +2308,9 @@ defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x0 defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">; defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">; +defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x35, F32_32X16X128_F4_SCALE_w32>; +defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>; + defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>; defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>; defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll index 1c7c625daaa7..1bf865c41427 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll @@ -2236,6 +2236,170 @@ bb: ret void } +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[42:43], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[42:43], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[42:43], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[42:43], v[24:27], off +; GISEL-NEXT: global_store_b128 v[42:43], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[42:43], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_ss: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_ss: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 2, i32 1, i32 %scale_src0, i32 1, i32 2, i32 %scale_src1, i1 true, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_movk_i32 s1, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 3, i32 2, i32 %scale_src0, i32 0, i32 1, i32 100, i1 false, i1 true) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v[40:41], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[44:45], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[44:45], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[44:45], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[44:45], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v[40:41], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[44:45], v[24:27], off +; GISEL-NEXT: global_store_b128 v[44:45], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[44:45], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[44:45], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 1, i32 0, i64 %scale_src0, i32 1, i32 0, i64 %scale_src1, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_ss: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_ss: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 2, i32 1, i64 %scale_src0, i32 1, i32 2, i64 %scale_src1, i1 true, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 3, i32 2, i64 %scale_src0, i32 0, i32 1, i64 100, i1 false, i1 true) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16: ; GFX1250: ; %bb.0: ; %bb @@ -2573,6 +2737,8 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>) +declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, i32, i64, i1, i1) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll index e602c31ebd80..48303c004f1d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll @@ -2530,6 +2530,312 @@ bb: ret void } +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> , i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 true, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26 +; GFX1250-NEXT: v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26 +; GFX1250-NEXT: v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26 +; GFX1250-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26 +; GFX1250-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v26 +; GFX1250-NEXT: v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26 +; GFX1250-NEXT: v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s1, 2.0 +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s8, s0 +; GISEL-NEXT: s_mov_b32 s9, s0 +; GISEL-NEXT: s_mov_b32 s10, s0 +; GISEL-NEXT: s_mov_b32 s11, s0 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> , i32 1, i32 0, i32 1, i32 1, i32 0, i32 2, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000 +; GFX1250-NEXT: s_movk_i32 s0, 0x65 +; GFX1250-NEXT: s_movk_i32 s1, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26 +; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26 +; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26 +; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26 +; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26 +; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26 +; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26 +; GFX1250-NEXT: v_mov_b32_e32 v41, v26 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s8, s0 +; GISEL-NEXT: s_mov_b32 s9, s0 +; GISEL-NEXT: s_mov_b32 s10, s0 +; GISEL-NEXT: s_mov_b32 s11, s0 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v43, 0x65 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v42, v43 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> , i32 1, i32 0, i32 100, i32 1, i32 0, i32 101, i1 true, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> , i32 1, i32 0, i64 %scale_src0, i32 1, i32 0, i64 %scale_src1, i1 true, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26 +; GFX1250-NEXT: v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26 +; GFX1250-NEXT: v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26 +; GFX1250-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26 +; GFX1250-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v26 +; GFX1250-NEXT: v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26 +; GFX1250-NEXT: v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s1, 2.0 +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s8, s0 +; GISEL-NEXT: s_mov_b32 s9, s0 +; GISEL-NEXT: s_mov_b32 s10, s0 +; GISEL-NEXT: s_mov_b32 s11, s0 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> , i32 1, i32 0, i64 1, i32 1, i32 0, i64 2, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000 +; GFX1250-NEXT: s_mov_b64 s[0:1], 0x65 +; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26 +; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26 +; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26 +; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26 +; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26 +; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26 +; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26 +; GFX1250-NEXT: v_mov_b32_e32 v41, v26 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s8, s0 +; GISEL-NEXT: s_mov_b32 s9, s0 +; GISEL-NEXT: s_mov_b32 s10, s0 +; GISEL-NEXT: s_mov_b32 s11, s0 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x65 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v[42:43], v[44:45] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> , i32 1, i32 0, i64 100, i32 1, i32 0, i64 101, i1 true, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1) declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1) @@ -2557,3 +2863,5 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>) +declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, i32, i64, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll index 14699ce630c1..8f674f84206f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll @@ -1882,6 +1882,162 @@ bb: ret void } +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 1, <16 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 3, <16 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 4, <16 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 1, <16 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 3, <16 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 4, <16 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16_negA: ; GFX1250: ; %bb.0: ; %bb @@ -2177,6 +2333,8 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>) +declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, i32, i64, i1, i1) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1) diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s index 93e65d3444b8..8185b77beb93 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s @@ -1737,3 +1737,173 @@ v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi: // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c] // WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 // GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW0 +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW0 +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW0 +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW0 +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s index 1eae8f6ba451..41cac9d1470a 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s @@ -449,6 +449,16 @@ v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MAT // GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 // GFX1250-ERR-NEXT: {{^}} ^ +v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47], v1, v2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8 +// GFX1250-ERR-NEXT: {{^}}v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47], v1, v2 +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP6 +// GFX1250-ERR-NEXT: {{^}}v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 +// GFX1250-ERR-NEXT: {{^}} ^ + v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0] // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand // GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0] @@ -468,3 +478,23 @@ v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,1,0] // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand // GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,1,0] // GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand +// GFX1250-ERR-NEXT: {{^}}v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 neg_lo:[1,0,0] +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_fmt:0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT: {{^}}v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_fmt:0 +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[0:1], v[2:3] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand +// GFX1250-ERR-NEXT: {{^}}v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[0:1], v[2:3] neg_lo:[1,0,0] +// GFX1250-ERR-NEXT: {{^}} ^ + +v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[0:1], v[2:3] matrix_a_fmt:0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT: {{^}}v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[0:1], v[2:3] matrix_a_fmt:0 +// GFX1250-ERR-NEXT: {{^}} ^ diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt index 2216348fa43c..a409dac321f8 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt @@ -999,3 +999,93 @@ 0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c # GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c] + +0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] + +0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] + +0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] + +0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] + +0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] + +0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] + +0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] + +0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c +# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] + +0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] + +0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] + +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] + +0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] + +0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] + +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] + +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] + +0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] + +0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c +# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]