From 829afc4c91fcac3823cdf06baf683fe3356a9e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20Brku=C5=A1anin?= Date: Wed, 18 Feb 2026 19:17:48 +0100 Subject: [PATCH] [AMDGPU] Add WMMA and SWMMAC instructions for gfx1170 (#180731) Introduce two new subtarget features: - WMMA256bInsts for GFX11 WMMA instructions and - WMMA128bInsts for GFX1170 and GFX12 WMMA and SWMMAC instructions Some WMMA instructions have changed from GFX 11.0 to GFX 11.7 so new Real versions were added with "_gfx1170" suffix. For consistency all WMMA and SWMMAC GFX11.7 instructions use this suffix. To resolve decoding issues between different formats for some WMMA instructions between GFX 11 and GFX 11.7, new decoding tables were added. --- clang/include/clang/Basic/BuiltinsAMDGPU.td | 140 +- .../builtins-amdgcn-gfx12-wmma-w32.cl | 141 +- .../builtins-amdgcn-gfx12-wmma-w64.cl | 141 +- .../builtins-amdgcn-wmma-w32-gfx10-err.cl | 16 +- .../builtins-amdgcn-wmma-w64-gfx10-err.cl | 16 +- llvm/lib/Target/AMDGPU/AMDGPU.td | 27 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 2 + .../Disassembler/AMDGPUDisassembler.cpp | 10 + .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 7 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 + llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1 + llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 142 +- llvm/lib/TargetParser/TargetParser.cpp | 30 +- ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll | 374 ++-- .../AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll | 598 +++--- .../GlobalISel/wmma-gfx12-w32-iu-modifiers.ll | 258 +-- .../wmma-gfx12-w32-swmmac-index_key.ll | 183 +- .../AMDGPU/GlobalISel/wmma-gfx12-w32.ll | 298 +-- ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll | 291 +-- .../AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll | 442 ++--- .../GlobalISel/wmma-gfx12-w64-iu-modifiers.ll | 186 +- .../wmma-gfx12-w64-swmmac-index_key.ll | 266 ++- .../AMDGPU/GlobalISel/wmma-gfx12-w64.ll | 226 +-- ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll | 371 ++-- .../test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll | 334 +++- .../AMDGPU/wmma-gfx12-w32-iu-modifiers.ll | 258 +-- .../AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll | 183 +- llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll | 298 +-- ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll | 303 +-- .../test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll | 334 ++-- .../AMDGPU/wmma-gfx12-w64-iu-modifiers.ll | 186 +- .../AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll | 266 ++- llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll | 226 +-- .../CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir | 237 +-- .../CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir | 237 +-- llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s | 1529 ++++++++++++++++ llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s | 1529 ++++++++++++++++ llvm/test/MC/AMDGPU/literals.s | 8 +- .../AMDGPU/gfx1170_dasm_wmma_w32.txt | 1628 +++++++++++++++++ .../AMDGPU/gfx1170_dasm_wmma_w64.txt | 1628 +++++++++++++++++ 42 files changed, 10505 insertions(+), 2854 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s create mode 100644 llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index 78443ac291f3..86b10eba55e8 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -358,23 +358,23 @@ def __builtin_amdgcn_s_wait_event : AMDGPUBuiltin<"void(_Constant short)", [], " // Postfix w32 indicates the builtin requires wavefront size of 32. // Postfix w64 indicates the builtin requires wavefront size of 64. //===----------------------------------------------------------------------===// -def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>)", [Const], "gfx11-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, float>)", [Const], "gfx11-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>)", [Const], "wmma-256b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, float>)", [Const], "wmma-256b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<4, float>)", [Const], "gfx11-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<4, float>)", [Const], "gfx11-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<4, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<4, float>)", [Const], "wmma-256b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<4, float>)", [Const], "wmma-256b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">; def __builtin_amdgcn_s_sendmsg_rtn : AMDGPUBuiltin<"unsigned int(_Constant unsigned int)", [], "gfx11-insts">; def __builtin_amdgcn_s_sendmsg_rtnl : AMDGPUBuiltin<"uint64_t(_Constant unsigned int)", [], "gfx11-insts">; @@ -599,67 +599,71 @@ def __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn : AMDGPUBuiltin<"_ExtVector<2, // The second return value of the intrinsic is zext'ed. def __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn : AMDGPUBuiltin<"_ExtVector<2, uint64_t>(unsigned int, unsigned int, _ExtVector<8, unsigned int>, _Constant int)", [], "gfx12-insts">; +//===----------------------------------------------------------------------===// +// GFX1170, GFX12+ only builtins. +//===----------------------------------------------------------------------===// + //===----------------------------------------------------------------------===// // WMMA builtins. // Postfix w32 indicates the builtin requires wavefront size of 32. // Postfix w64 indicates the builtin requires wavefront size of 64. // -// Some of these are very similar to their GFX11 counterparts, but they don't -// require replication of the A,B matrices, so they use fewer vector elements. -// Therefore, we add an "_gfx12" suffix to distinguish them from the existing -// builtins. +// Some of these are very similar to their base GFX11 counterparts, but they +// don't require replication of the A,B matrices, so they use fewer vector +// elements. Therefore, we add an "_gfx12" suffix to distinguish them from the +// existing builtins. //===----------------------------------------------------------------------===// -def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, _Float16>)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, short>)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, int, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">; -// These are gfx12-only, but for consistency with the other WMMA variants we're -// keeping the "_gfx12" suffix. -def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, _Float16>)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, short>)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, int, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">; +// These are gfx1170 and gfx12 only, but for consistency with the other WMMA +// variants we're keeping the "_gfx12" suffix. +def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, _Float16>)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, short>)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">; -// These are gfx12-only, but for consistency with the other WMMA variants we're -// keeping the "_gfx12" suffix. -def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, _Float16>)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, short>)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">; +// These are gfx1170 and gfx12 only, but for consistency with the other WMMA +// variants we're keeping the "_gfx12" suffix. +def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, __fp16>, int)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, short>, int)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, short>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, __fp16>, int)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, short>, int)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, short>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_prng_b32 : AMDGPUBuiltin<"unsigned int(unsigned int)", [Const], "prng-inst">; def __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, _Float16>, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl index 6326866ed3c3..47ae7ce82bec 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl @@ -1,6 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1170 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s typedef int v2i __attribute__((ext_vector_type(2))); typedef float v8f __attribute__((ext_vector_type(8))); @@ -14,12 +15,12 @@ typedef int v8i __attribute__((ext_vector_type(8))); // amdgcn_wmma_f32_16x16x16_f16 // -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8:![0-9]+]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x float> [[C]]) +// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8:![0-9]+]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) { @@ -30,12 +31,12 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) // amdgcn_wmma_f32_16x16x16_bf16 // -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x float> [[C]]) +// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) { @@ -46,12 +47,12 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c // amdgcn_wmma_f16_16x16x16_f16 // -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], i1 false) +// CHECK-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) { @@ -62,12 +63,12 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) // amdgcn_wmma_bf16_16x16x16_bf16 // -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], i1 false) +// CHECK-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) { @@ -78,12 +79,12 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s // amdgcn_wmma_i32_16x16x16_iu8 // -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false) +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) { @@ -94,79 +95,79 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) // amdgcn_wmma_i32_16x16x16_iu4 // -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <8 x i32> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <8 x i32> [[C]], i1 false) +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) { *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false); } -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) +// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) { *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) +// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) { *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) +// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) { *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) +// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) { *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false) +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) { *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); } //. -// CHECK-GFX1200: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} -// CHECK-GFX1200: [[META7]] = !{!"Simple C/C++ TBAA"} -// CHECK-GFX1200: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} +// CHECK: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} //. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl index a79c3d4da1eb..98ce84adf155 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl @@ -1,6 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1170 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s typedef float v4f __attribute__((ext_vector_type(4))); typedef half v4h __attribute__((ext_vector_type(4))); @@ -13,12 +14,12 @@ typedef int v4i __attribute__((ext_vector_type(4))); // amdgcn_wmma_f32_16x16x16_f16 // -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w64( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w64( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x float> [[C]]) +// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) { @@ -29,12 +30,12 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) // amdgcn_wmma_f32_16x16x16_bf16 // -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w64( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w64( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x float> [[C]]) +// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) { @@ -45,12 +46,12 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c // amdgcn_wmma_f16_16x16x16_f16 // -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w64( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w64( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], i1 false) +// CHECK-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) { @@ -61,12 +62,12 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) // amdgcn_wmma_bf16_16x16x16_bf16 // -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], i1 false) +// CHECK-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) { @@ -77,12 +78,12 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s // amdgcn_wmma_i32_16x16x16_iu8 // -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w64( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w64( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false) +// CHECK-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) { @@ -93,79 +94,79 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) // amdgcn_wmma_i32_16x16x16_iu4 // -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w64( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w64( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false) +// CHECK-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) { *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false); } -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) +// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c) { *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) +// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c) { *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) +// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c) { *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) +// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c) { *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c); } -// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32( -// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-GFX1200-NEXT: [[ENTRY:.*:]] -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] -// CHECK-GFX1200-NEXT: ret void +// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false) +// CHECK-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c) { *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); } //. -// CHECK-GFX1200: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} -// CHECK-GFX1200: [[META7]] = !{!"Simple C/C++ TBAA"} -// CHECK-GFX1200: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} +// CHECK: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} //. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl index a1a56f0d8417..ed72a8ee7dbd 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl @@ -21,14 +21,14 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b global v16s* out16s, v2i a2i, v2i b2i, v16s c16s, global v8i* out8i, v4i a4i, v4i b4i, v8i c8i) { - *out8f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a16h, b16h, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w32' needs target feature gfx11-insts,wavefrontsize32}} - *out8f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a16s, b16s, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32' needs target feature gfx11-insts,wavefrontsize32}} - *out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w32' needs target feature gfx11-insts,wavefrontsize32}} - *out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32' needs target feature gfx11-insts,wavefrontsize32}} - *out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32' needs target feature gfx11-insts,wavefrontsize32}} - *out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32' needs target feature gfx11-insts,wavefrontsize32}} - *out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a4i, true, b4i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32' needs target feature gfx11-insts,wavefrontsize32}} - *out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a2i, true, b2i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32' needs target feature gfx11-insts,wavefrontsize32}} + *out8f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a16h, b16h, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w32' needs target feature wmma-256b-insts,wavefrontsize32}} + *out8f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a16s, b16s, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32' needs target feature wmma-256b-insts,wavefrontsize32}} + *out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w32' needs target feature wmma-256b-insts,wavefrontsize32}} + *out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32' needs target feature wmma-256b-insts,wavefrontsize32}} + *out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32' needs target feature wmma-256b-insts,wavefrontsize32}} + *out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32' needs target feature wmma-256b-insts,wavefrontsize32}} + *out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a4i, true, b4i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32' needs target feature wmma-256b-insts,wavefrontsize32}} + *out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a2i, true, b2i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32' needs target feature wmma-256b-insts,wavefrontsize32}} } #endif diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl index d995b1dc46be..4b1808fe6d6e 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl @@ -21,14 +21,14 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out4f, v16h a16h, v16h b global v8s* out8s, v4i a4i, v4i b4i, v8s c8s, global v4i* out4i, v2i a2i, v2i b2i, v4i c4i) { - *out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature gfx11-insts,wavefrontsize64}} - *out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature gfx11-insts,wavefrontsize64}} - *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature gfx11-insts,wavefrontsize64}} - *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature gfx11-insts,wavefrontsize64}} - *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature gfx11-insts,wavefrontsize64}} - *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature gfx11-insts,wavefrontsize64}} - *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature gfx11-insts,wavefrontsize64}} - *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature gfx11-insts,wavefrontsize64}} + *out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature wmma-256b-insts,wavefrontsize64}} + *out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature wmma-256b-insts,wavefrontsize64}} + *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature wmma-256b-insts,wavefrontsize64}} + *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature wmma-256b-insts,wavefrontsize64}} + *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature wmma-256b-insts,wavefrontsize64}} + *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature wmma-256b-insts,wavefrontsize64}} + *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature wmma-256b-insts,wavefrontsize64}} + *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature wmma-256b-insts,wavefrontsize64}} } #endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 9ad2f2e11fbc..91c0c62326ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -775,6 +775,14 @@ defm CvtFP8VOP1Bug : AMDGPUSubtargetFeature<"cvt-fp8-vop1-bug", [FeatureFP8ConversionInsts] >; +defm WMMA256bInsts : AMDGPUSubtargetFeature<"wmma-256b-insts", + "Has WMMA instructions where A and B matrices have duplicated data" +>; + +defm WMMA128bInsts : AMDGPUSubtargetFeature<"wmma-128b-insts", + "Has WMMA instructions where A and B matrices do not have duplicated data" +>; + defm PkFmacF16Inst : AMDGPUSubtargetFeature<"pk-fmac-f16-inst", "Has v_pk_fmac_f16 instruction" >; @@ -1820,9 +1828,9 @@ def FeatureISAVersion11_Common : FeatureSet< FeatureD16Writes32BitVgpr, ]>; -// There are few workarounds that need to be -// added to all targets. This pessimizes codegen -// a bit on the generic GFX11 target. +// There are few workarounds that need to be added to all targets. This +// pessimizes codegen a bit on the generic GFX11 target. This generic target +// does not include GFX1170 due to incompatible changes. def FeatureISAVersion11_Generic: FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureMSAALoadDstSelBug, @@ -1831,14 +1839,16 @@ def FeatureISAVersion11_Generic: FeatureSet< FeatureMADIntraFwdBug, FeaturePrivEnabledTrap2NopBug, FeatureRequiresCOV6, - FeatureRequiredExportPriority])>; + FeatureRequiredExportPriority, + FeatureWMMA256bInsts])>; def FeatureISAVersion11_0_Common : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureMSAALoadDstSelBug, FeatureVALUTransUseHazard, FeatureMADIntraFwdBug, - FeaturePrivEnabledTrap2NopBug])>; + FeaturePrivEnabledTrap2NopBug, + FeatureWMMA256bInsts])>; def FeatureISAVersion11_0_0 : FeatureSet< !listconcat(FeatureISAVersion11_0_Common.Features, @@ -1861,7 +1871,8 @@ def FeatureISAVersion11_5_Common : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, - FeatureRequiredExportPriority])>; + FeatureRequiredExportPriority, + FeatureWMMA256bInsts])>; def FeatureISAVersion11_5_0 : FeatureSet< !listconcat(FeatureISAVersion11_5_Common.Features, @@ -1885,7 +1896,8 @@ def FeatureISAVersion11_7_0 : FeatureSet< [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, FeatureFP8ConversionInsts, - FeatureDot11Insts])>; + FeatureDot11Insts, + FeatureWMMA128bInsts])>; def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, @@ -1915,6 +1927,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureImageInsts, FeatureExtendedImageInsts, FeatureFP8ConversionInsts, + FeatureWMMA128bInsts, FeatureIEEEMinimumMaximumInsts, FeaturePackedTID, FeatureVcmpxPermlaneHazard, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 998a9d0910a0..01cc4ff4ae85 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1556,6 +1556,8 @@ public: return AMDGPU::isGFX11Plus(getSTI()); } + bool isGFX1170() const { return AMDGPU::isGFX1170(getSTI()); } + bool isGFX12() const { return AMDGPU::isGFX12(getSTI()); } bool isGFX12Plus() const { return AMDGPU::isGFX12Plus(getSTI()); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index b2dfd098735a..2309a56f612f 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -686,11 +686,19 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Address, CS)) break; + if (isGFX1170() && + tryDecodeInst(DecoderTableGFX117064, MI, QW, Address, CS)) + break; + if (isGFX11() && tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW, Address, CS)) break; + if (isGFX1170() && + tryDecodeInst(DecoderTableGFX1170W6464, MI, QW, Address, CS)) + break; + if (isGFX11() && tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS)) break; @@ -2247,6 +2255,8 @@ bool AMDGPUDisassembler::isGFX11Plus() const { return AMDGPU::isGFX11Plus(STI); } +bool AMDGPUDisassembler::isGFX1170() const { return AMDGPU::isGFX1170(STI); } + bool AMDGPUDisassembler::isGFX12() const { return STI.hasFeature(AMDGPU::FeatureGFX12); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 28f71d8d7556..b01eb8dd59fa 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -178,6 +178,7 @@ public: bool isGFX10() const; bool isGFX10Plus() const; bool isGFX11() const; + bool isGFX1170() const; bool isGFX11Plus() const; bool isGFX12() const; bool isGFX12Plus() const; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index b308e0d77305..2365b6175a46 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -396,6 +396,10 @@ public: return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); } + bool isGFX1170() const { + return getGeneration() == GFX11 && hasWMMA128bInsts(); + } + bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; } bool hasAtomicFaddInsts() const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index f063b4eb7777..c2396674e4f9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -44,9 +44,10 @@ class GFXGen { def GFX13Gen : GFXGen; def GFX1250Gen : GFXGen; def GFX12Not12_50Gen : GFXGen; -def GFX12Gen : GFXGen; -def GFX11Gen : GFXGen; -def GFX10Gen : GFXGen; +def GFX12Gen : GFXGen; +def GFX1170Gen : GFXGen; +def GFX11Gen : GFXGen; +def GFX10Gen : GFXGen; //===----------------------------------------------------------------------===// // SI DAG Nodes diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 3f32d1166fc8..8ccbbb5f0320 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2598,6 +2598,10 @@ bool isGFX11(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureGFX11); } +bool isGFX1170(const MCSubtargetInfo &STI) { + return isGFX11(STI) && STI.hasFeature(AMDGPU::FeatureWMMA128bInsts); +} + bool isGFX11Plus(const MCSubtargetInfo &STI) { return isGFX11(STI) || isGFX12Plus(STI); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 7500c2481a4b..1035134a837d 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1705,6 +1705,7 @@ bool isGFX10Plus(const MCSubtargetInfo &STI); bool isNotGFX10Plus(const MCSubtargetInfo &STI); bool isGFX10Before1030(const MCSubtargetInfo &STI); bool isGFX11(const MCSubtargetInfo &STI); +bool isGFX1170(const MCSubtargetInfo &STI); bool isGFX11Plus(const MCSubtargetInfo &STI); bool isGFX12(const MCSubtargetInfo &STI); bool isGFX12Plus(const MCSubtargetInfo &STI); diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 9a4054b8ad24..9e62dc7c9db0 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1426,22 +1426,18 @@ multiclass WMMAInst; - let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { - let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in { - def _twoaddr # Suffix : VOP3P_Pseudo; - } - } - if convertibleTo3Addr then { + + let SubtargetPredicate = HasWMMA256bInsts in { let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { - let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { - def _threeaddr # Suffix : VOP3P_Pseudo; + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in { + def _twoaddr # Suffix : VOP3P_Pseudo; + } + if convertibleTo3Addr then { + let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { + def _threeaddr # Suffix : VOP3P_Pseudo; + } } } - def : WMMAOpcodeMapping(NAME # _twoaddr # Suffix), - !cast(NAME # _threeaddr # Suffix)>; - } - - let SubtargetPredicate = isGFX11Only in { if !eq(Type, WMMAOpSel) then { def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; } else if !eq(Type, WMMAUIClamp) then { @@ -1450,6 +1446,11 @@ multiclass WMMAInst(NAME # _twoaddr # Suffix), node, P>; } } + + if convertibleTo3Addr then { + def : WMMAOpcodeMapping(NAME # _twoaddr # Suffix), + !cast(NAME # _threeaddr # Suffix)>; + } } @@ -1727,7 +1728,7 @@ multiclass WMMAInstGFX12, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; @@ -2047,7 +2048,7 @@ class SWMMACPat_w64; defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>; defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>; @@ -2074,7 +2075,7 @@ let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12PlusNot12_50 in { def : SWMMACPat; } -let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in { +let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX11PlusNot12_50, OtherPredicates = [HasWMMA128bInsts] in { defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>; defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>; defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>; @@ -2229,6 +2230,18 @@ multiclass VOP3P_WMMA_Real_Base op, VOP3PWMMA_Profile WMMAP, VOP3PeWmma(backing_ps_name).Pfl, WMMAP>; } +multiclass VOP3P_Real_WMMA_gfx1170 op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave32, DecoderNamespace = "GFX1170" in { + defm _twoaddr : VOP3P_WMMA_Real_Base ; + } +} + +multiclass VOP3P_Real_WMMA_gfx1170w64 op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave64, DecoderNamespace = "GFX1170W64" in { + defm _twoaddr : VOP3P_WMMA_Real_Base ; + } +} + multiclass VOP3P_Real_WMMA_gfx12 op, VOP3PWMMA_Profile WMMAP> { let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in { defm _twoaddr : VOP3P_WMMA_Real_Base ; @@ -2241,6 +2254,14 @@ multiclass VOP3P_Real_WMMA_gfx12w64 op, VOP3PWMMA_Profile WMMAP> { } } +multiclass VOP3P_Real_WMMA_gfx1170_gfx12 op, VOP3PWMMA_Profile WMMAP> : + VOP3P_Real_WMMA_gfx1170, + VOP3P_Real_WMMA_gfx12; + +multiclass VOP3P_Real_WMMA_gfx1170_gfx12w64 op, VOP3PWMMA_Profile WMMAP> : + VOP3P_Real_WMMA_gfx1170w64, + VOP3P_Real_WMMA_gfx12w64; + multiclass VOP3P_Real_WMMA_gfx1250 op, VOP3PWMMA_Profile WMMAP> { let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in { defm _twoaddr : VOP3P_WMMA_Real_Base ; @@ -2345,54 +2366,53 @@ multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats op, bits<8> Ld } } -defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>; -defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>; -defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>; -defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x043, BF16_BF16_WMMA_w32>; -defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>; -defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>; -defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>; -defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>; -defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>; -defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>; -defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>; +defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x040, F32_F16_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x041, F32_BF16_WMMA_w32>; +defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x042, F16_F16_WMMA_w32>; +defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x043, BF16_BF16_WMMA_w32>; +defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x044, I32_IU8_WMMA_w32>; +defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x045, I32_IU4X16_WMMA_w32>; +defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x046, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x047, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x048, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x049, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x04a, I32_IU4X32_WMMA_w32>; -defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>; -defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>; -defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>; -defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>; -defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>; -defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>; -defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>; -defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>; -defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>; -defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>; -defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>; +defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x040, F32_F16_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x041, F32_BF16_WMMA_w64>; +defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x042, F16_F16_WMMA_w64>; +defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x043, BF16_BF16_WMMA_w64>; +defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x044, I32_IU8_WMMA_w64>; +defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>; +defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>; +defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x050, F32_F16_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x051, F32_BF16_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x052, F16_F16_SWMMAC_w32>; +defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x053, BF16_BF16_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x054, I32_IU8_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>; -defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>; -defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>; -defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>; -defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>; -defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>; -defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>; -defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>; -defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>; -defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>; -defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>; -defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>; - -defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>; -defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>; -defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>; -defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>; -defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>; -defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>; -defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>; -defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>; -defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>; -defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>; -defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x050, F32_F16_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>; +defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x052, F16_F16_SWMMAC_w64>; +defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>; defm V_WMMA_F32_16X16X4_F32_w32 : VOP3P_Real_WMMA_gfx1250 <0x05d, F32_F32_WMMA_w32>; defm V_WMMA_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x062, F32_BF16X32_WMMA_w32>; diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index fc5d7519bdff..d5f8b9b2c572 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -515,13 +515,38 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["qsad-insts"] = true; Features["cvt-pknorm-vop2-insts"] = true; Features["fp8-conversion-insts"] = true; + Features["wmma-128b-insts"] = true; Features["atomic-fmin-fmax-global-f32"] = true; break; case GK_GFX1170: - // TODO-GFX1170: Update features map for gfx1170 + Features["ci-insts"] = true; + Features["dot5-insts"] = true; + Features["dot7-insts"] = true; + Features["dot8-insts"] = true; + Features["dot9-insts"] = true; + Features["dot10-insts"] = true; + Features["dot12-insts"] = true; + Features["dl-insts"] = true; + Features["16-bit-insts"] = true; + Features["dpp"] = true; + Features["gfx8-insts"] = true; + Features["gfx9-insts"] = true; + Features["gfx10-insts"] = true; + Features["gfx10-3-insts"] = true; + Features["gfx11-insts"] = true; + Features["atomic-fadd-rtn-insts"] = true; + Features["image-insts"] = true; + Features["cube-insts"] = true; + Features["lerp-inst"] = true; + Features["sad-insts"] = true; + Features["qsad-insts"] = true; + Features["cvt-pknorm-vop2-insts"] = true; + Features["gws"] = true; Features["dot11-insts"] = true; Features["fp8-conversion-insts"] = true; - [[fallthrough]]; + Features["wmma-128b-insts"] = true; + Features["atomic-fmin-fmax-global-f32"] = true; + break; case GK_GFX1153: case GK_GFX1152: case GK_GFX1151: @@ -554,6 +579,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["qsad-insts"] = true; Features["cvt-pknorm-vop2-insts"] = true; Features["gws"] = true; + Features["wmma-256b-insts"] = true; Features["atomic-fmin-fmax-global-f32"] = true; break; case GK_GFX1036: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll index 9693d544d153..450cd0701911 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -1,14 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) @@ -17,13 +18,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) @@ -32,13 +33,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) @@ -47,13 +48,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) @@ -62,13 +63,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) @@ -77,13 +78,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) @@ -92,11 +93,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) @@ -105,11 +106,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) @@ -118,11 +119,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x half> %C %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) @@ -131,11 +132,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) @@ -144,13 +145,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) @@ -159,13 +160,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) @@ -174,13 +175,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) @@ -189,13 +190,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) @@ -204,13 +205,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) @@ -219,13 +220,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) @@ -234,13 +235,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) @@ -249,13 +250,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) @@ -264,13 +265,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off -; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off +; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) @@ -279,13 +280,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off -; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off +; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <16 x half> %B %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) @@ -294,11 +295,11 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) @@ -307,11 +308,11 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <16 x half> %B %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) @@ -322,13 +323,13 @@ bb: ; both neg and abs patterns (wmma matrix C f32 or f16 ) define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %fneg.fabs.C = fneg <8 x float> %fabs.C @@ -338,11 +339,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) %fneg.fabs.C = fneg <8 x half> %fabs.C @@ -352,15 +353,15 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: s_endpgm bb: %el3 = extractelement <8 x float> %C, i32 3 %el3.fabs = call float @llvm.fabs.f32(float %el3) @@ -374,13 +375,13 @@ bb: ; A or B matrix modifier and constant in C define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> ) @@ -389,11 +390,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> , i1 0) @@ -404,6 +405,27 @@ bb: ; pack f16 elements with v_perm_b32 since they don't come from same b32 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: flat_load_b128 v[12:15], v[8:9] +; GFX1170-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16 +; GFX1170-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX1170-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GFX1170-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; GFX1170-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1170-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX1170-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; GFX1170-NEXT: v_lshl_or_b32 v12, v13, 16, v8 +; GFX1170-NEXT: v_lshl_or_b32 v13, v15, 16, v9 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1170-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX1170-NEXT: v_lshl_or_b32 v15, v19, 16, v16 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] +; GFX1170-NEXT: global_store_b128 v[10:11], v[12:15], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll index 6b749df71223..8f8267952cbe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll @@ -1,14 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -16,27 +17,27 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: s_mov_b32 s5, s0 -; GFX12-NEXT: s_mov_b32 s6, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 -; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 -; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 -; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s7, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s0 +; GCN-NEXT: s_mov_b32 s6, s0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GCN-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 +; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -44,13 +45,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -58,27 +59,27 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: s_mov_b32 s5, s0 -; GFX12-NEXT: s_mov_b32 s6, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 -; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 -; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 -; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s7, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s0 +; GCN-NEXT: s_mov_b32 s6, s0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GCN-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 +; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -86,11 +87,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) store <8 x half> %res, ptr addrspace(1) %out @@ -98,19 +99,19 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x42004200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 -; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x42004200 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) store <8 x half> %res, ptr addrspace(1) %out @@ -118,19 +119,19 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 -; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x3f803f80 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) store <8 x i16> %res, ptr addrspace(1) %out @@ -138,19 +139,19 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 -; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x3fc03fc0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) store <8 x i16> %res, ptr addrspace(1) %out @@ -158,13 +159,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -172,27 +173,27 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_movk_i32 s0, 0x80 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: s_mov_b32 s5, s0 -; GFX12-NEXT: s_mov_b32 s6, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 -; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 -; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 -; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_movk_i32 s0, 0x80 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s7, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s0 +; GCN-NEXT: s_mov_b32 s6, s0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -200,13 +201,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -214,27 +215,27 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_movk_i32 s0, 0x80 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: s_mov_b32 s5, s0 -; GFX12-NEXT: s_mov_b32 s6, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6 -; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 -; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2 -; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_movk_i32 s0, 0x80 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s7, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s0 +; GCN-NEXT: s_mov_b32 s6, s0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6 +; GCN-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GCN-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2 +; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -242,13 +243,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -256,27 +257,27 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: s_mov_b32 s5, s0 -; GFX12-NEXT: s_mov_b32 s6, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 -; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 -; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 -; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s7, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s0 +; GCN-NEXT: s_mov_b32 s6, s0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -284,13 +285,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -298,27 +299,27 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: s_mov_b32 s5, s0 -; GFX12-NEXT: s_mov_b32 s6, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 -; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 -; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 -; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s7, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s0 +; GCN-NEXT: s_mov_b32 s6, s0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -326,13 +327,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -340,27 +341,27 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: s_mov_b32 s5, s0 -; GFX12-NEXT: s_mov_b32 s6, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 -; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 -; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 -; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s7, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s0 +; GCN-NEXT: s_mov_b32 s6, s0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -368,13 +369,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -382,27 +383,27 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: s_mov_b32 s5, s0 -; GFX12-NEXT: s_mov_b32 s6, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 -; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 -; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 -; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s7, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s0 +; GCN-NEXT: s_mov_b32 s6, s0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -410,13 +411,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -424,27 +425,27 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_movk_i32 s0, 0x80 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: s_mov_b32 s5, s0 -; GFX12-NEXT: s_mov_b32 s6, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 -; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 -; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 -; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_movk_i32 s0, 0x80 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s7, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s0 +; GCN-NEXT: s_mov_b32 s6, s0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -473,3 +474,6 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1170: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll index 929a51bfff53..37900d6db102 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll @@ -1,14 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -16,13 +17,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -30,13 +31,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) store <8 x i32> %res, ptr addrspace(1) %out @@ -46,13 +47,13 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off -; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off +; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -60,13 +61,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off -; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off +; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -74,13 +75,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off -; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off +; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1) store <8 x i32> %res, ptr addrspace(1) %out @@ -90,13 +91,13 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -104,13 +105,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -118,13 +119,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) store <8 x i32> %res, ptr addrspace(1) %out @@ -136,13 +137,13 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -150,13 +151,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -164,13 +165,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) store <8 x i32> %res, ptr addrspace(1) %out @@ -180,13 +181,13 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off -; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off +; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -194,13 +195,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off -; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off +; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -208,13 +209,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off -; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off +; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) store <8 x i32> %res, ptr addrspace(1) %out @@ -224,13 +225,13 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -238,13 +239,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -252,13 +253,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1) store <8 x i32> %res, ptr addrspace(1) %out @@ -271,3 +272,6 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1170: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll index 7c0f72606a5b..a3d0da7dfc14 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll @@ -1,7 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v20, v[20:21], off +; GFX1170-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX1170-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX1170-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX1170-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX1170-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v20, v[20:21], off @@ -32,6 +52,25 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v20, v[20:21], off +; GFX1170-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX1170-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX1170-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX1170-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX1170-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v20, v[20:21], off @@ -62,6 +101,19 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v16, v[16:17], off +; GFX1170-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX1170-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 +; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX1170-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v16, v[16:17], off @@ -86,6 +138,19 @@ bb: } define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v16, v[16:17], off +; GFX1170-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX1170-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 +; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX1170-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v16, v[16:17], off @@ -110,6 +175,25 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v14, v[14:15], off +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v14, v[14:15], off @@ -140,6 +224,25 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v11, v[11:12], off +; GFX1170-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9 +; GFX1170-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX1170-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX1170-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[13:14], v[17:20], off +; GFX1170-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX1170-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v11, v[11:12], off @@ -170,6 +273,25 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v14, v[14:15], off +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v14, v[14:15], off @@ -200,6 +322,25 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v14, v[14:15], off +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v14, v[14:15], off @@ -230,6 +371,25 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v14, v[14:15], off +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v14, v[14:15], off @@ -260,6 +420,25 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v14, v[14:15], off +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v14, v[14:15], off @@ -299,3 +478,5 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll index da61bc475887..4eacdbe171e3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll @@ -1,14 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C) store <8 x float> %res, ptr addrspace(1) %out @@ -16,13 +17,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C) store <8 x float> %res, ptr addrspace(1) %out @@ -30,11 +31,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0) store <8 x half> %res, ptr addrspace(1) %out @@ -42,11 +43,11 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_bf16_16x16x16_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0) store <8 x i16> %res, ptr addrspace(1) %out @@ -54,13 +55,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -68,13 +69,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off -; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off +; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -82,13 +83,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) store <8 x float> %res, ptr addrspace(1) %out @@ -96,13 +97,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) store <8 x float> %res, ptr addrspace(1) %out @@ -110,13 +111,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) store <8 x float> %res, ptr addrspace(1) %out @@ -124,13 +125,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) store <8 x float> %res, ptr addrspace(1) %out @@ -138,13 +139,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -153,13 +154,13 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off -; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off +; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) store <8 x float> %res, ptr addrspace(1) %out @@ -167,13 +168,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off -; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off +; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) store <8 x float> %res, ptr addrspace(1) %out @@ -181,11 +182,11 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 -; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f16_16x16x32_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) store <8 x half> %res, ptr addrspace(1) %out @@ -193,11 +194,11 @@ bb: } define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 -; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) store <8 x i16> %res, ptr addrspace(1) %out @@ -205,13 +206,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -219,13 +220,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off -; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off +; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -233,13 +234,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -247,13 +248,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) store <8 x float> %res, ptr addrspace(1) %out @@ -261,13 +262,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) store <8 x float> %res, ptr addrspace(1) %out @@ -275,13 +276,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) store <8 x float> %res, ptr addrspace(1) %out @@ -289,13 +290,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) store <8 x float> %res, ptr addrspace(1) %out @@ -324,3 +325,6 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1170: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll index a345ee6def7a..3886a072b176 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <4 x half> %A %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C) @@ -15,11 +16,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <4 x half> %B %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) @@ -28,11 +29,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) @@ -41,11 +42,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) @@ -54,11 +55,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) @@ -67,11 +68,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) @@ -80,11 +81,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <4 x half> %A %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) @@ -93,11 +94,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <4 x half> %B %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) @@ -106,11 +107,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x half> %C %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) @@ -119,11 +120,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) @@ -132,11 +133,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) @@ -145,11 +146,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) @@ -158,11 +159,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) @@ -171,11 +172,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) @@ -184,11 +185,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) @@ -197,11 +198,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) @@ -210,11 +211,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) @@ -223,11 +224,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) @@ -236,11 +237,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <4 x half> %A %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) @@ -249,11 +250,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) @@ -262,11 +263,11 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <4 x half> %A %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) @@ -275,11 +276,11 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) @@ -290,11 +291,11 @@ bb: ; both neg and abs patterns (wmma matrix C f32 or f16 ) define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %fneg.fabs.C = fneg <4 x float> %fabs.C @@ -304,11 +305,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) %fneg.fabs.C = fneg <4 x half> %fabs.C @@ -318,13 +319,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %el3 = extractelement <4 x float> %C, i32 3 %el3.fabs = call float @llvm.fabs.f32(float %el3) @@ -338,11 +339,11 @@ bb: ; A or B matrix modifier and constant in C define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <4 x half> %A %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> ) @@ -351,11 +352,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <4 x half> %B %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> , i1 0) @@ -366,6 +367,20 @@ bb: ; pack f16 elements with v_perm_b32 since they don't come from same b32 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: flat_load_b128 v[8:11], v[4:5] +; GFX1170-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1170-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GFX1170-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1170-NEXT: v_lshl_or_b32 v4, v9, 16, v4 +; GFX1170-NEXT: v_lshl_or_b32 v5, v11, 16, v5 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX1170-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll index 5344ab8da1ad..ce9b8f9fc3c1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -14,21 +15,21 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v9, s3 -; GFX12-NEXT: v_mov_b32_e32 v8, s2 -; GFX12-NEXT: v_mov_b32_e32 v7, s1 -; GFX12-NEXT: v_mov_b32_e32 v6, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9] -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s3 +; GCN-NEXT: v_mov_b32_e32 v8, s2 +; GCN-NEXT: v_mov_b32_e32 v7, s1 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9] +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -36,11 +37,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -48,21 +49,21 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v9, s3 -; GFX12-NEXT: v_mov_b32_e32 v8, s2 -; GFX12-NEXT: v_mov_b32_e32 v7, s1 -; GFX12-NEXT: v_mov_b32_e32 v6, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9] -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s3 +; GCN-NEXT: v_mov_b32_e32 v8, s2 +; GCN-NEXT: v_mov_b32_e32 v7, s1 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9] +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -70,11 +71,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 +; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) store <4 x half> %res, ptr addrspace(1) %out @@ -82,17 +83,17 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x42004200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s1 -; GFX12-NEXT: v_mov_b32_e32 v6, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] -; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x42004200 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s1 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] +; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) store <4 x half> %res, ptr addrspace(1) %out @@ -100,17 +101,17 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s1 -; GFX12-NEXT: v_mov_b32_e32 v6, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] -; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x3f803f80 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s1 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) store <4 x i16> %res, ptr addrspace(1) %out @@ -118,17 +119,17 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s1 -; GFX12-NEXT: v_mov_b32_e32 v6, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] -; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x3fc03fc0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s1 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) store <4 x i16> %res, ptr addrspace(1) %out @@ -136,11 +137,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -148,21 +149,21 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_movk_i32 s0, 0x80 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 -; GFX12-NEXT: v_mov_b32_e32 v6, s2 -; GFX12-NEXT: v_mov_b32_e32 v5, s1 -; GFX12-NEXT: v_mov_b32_e32 v4, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_movk_i32 s0, 0x80 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -170,11 +171,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -182,21 +183,21 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_movk_i32 s0, 0x80 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 -; GFX12-NEXT: v_mov_b32_e32 v6, s2 -; GFX12-NEXT: v_mov_b32_e32 v5, s1 -; GFX12-NEXT: v_mov_b32_e32 v4, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_movk_i32 s0, 0x80 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -204,11 +205,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -216,21 +217,21 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 -; GFX12-NEXT: v_mov_b32_e32 v6, s2 -; GFX12-NEXT: v_mov_b32_e32 v5, s1 -; GFX12-NEXT: v_mov_b32_e32 v4, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -238,11 +239,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -250,21 +251,21 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 -; GFX12-NEXT: v_mov_b32_e32 v6, s2 -; GFX12-NEXT: v_mov_b32_e32 v5, s1 -; GFX12-NEXT: v_mov_b32_e32 v4, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -272,11 +273,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -284,21 +285,21 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 -; GFX12-NEXT: v_mov_b32_e32 v6, s2 -; GFX12-NEXT: v_mov_b32_e32 v5, s1 -; GFX12-NEXT: v_mov_b32_e32 v4, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -306,11 +307,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -318,21 +319,21 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 -; GFX12-NEXT: v_mov_b32_e32 v6, s2 -; GFX12-NEXT: v_mov_b32_e32 v5, s1 -; GFX12-NEXT: v_mov_b32_e32 v4, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -340,11 +341,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -352,21 +353,21 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_movk_i32 s0, 0x80 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 -; GFX12-NEXT: v_mov_b32_e32 v6, s2 -; GFX12-NEXT: v_mov_b32_e32 v5, s1 -; GFX12-NEXT: v_mov_b32_e32 v4, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_movk_i32 s0, 0x80 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -384,3 +385,6 @@ declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1170: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll index e47350db4003..a87163b0dca1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -14,11 +15,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -26,11 +27,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) store <4 x i32> %res, ptr addrspace(1) %out @@ -40,11 +41,11 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -52,11 +53,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -64,11 +65,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) store <4 x i32> %res, ptr addrspace(1) %out @@ -78,11 +79,11 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -90,11 +91,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -102,11 +103,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) store <4 x i32> %res, ptr addrspace(1) %out @@ -119,11 +120,11 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -131,11 +132,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -143,11 +144,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) store <4 x i32> %res, ptr addrspace(1) %out @@ -157,11 +158,11 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] -; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -169,11 +170,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] -; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -181,11 +182,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp -; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1) store <4 x i32> %res, ptr addrspace(1) %out @@ -195,11 +196,11 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -207,11 +208,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -219,11 +220,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1) store <4 x i32> %res, ptr addrspace(1) %out @@ -236,3 +237,6 @@ declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 imma declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1170: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll index da6852042f7f..7d31e262b486 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll @@ -1,7 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v10, v[10:11], off +; GFX1170-NEXT: v_mov_b32_e32 v23, v9 +; GFX1170-NEXT: v_mov_b32_e32 v22, v8 +; GFX1170-NEXT: v_mov_b32_e32 v21, v7 +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v9 +; GFX1170-NEXT: v_mov_b32_e32 v26, v8 +; GFX1170-NEXT: v_mov_b32_e32 v25, v7 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v31, v9 +; GFX1170-NEXT: v_mov_b32_e32 v30, v8 +; GFX1170-NEXT: v_mov_b32_e32 v29, v7 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX1170-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX1170-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX1170-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v10, v[10:11], off @@ -46,6 +74,33 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v10, v[10:11], off +; GFX1170-NEXT: v_mov_b32_e32 v23, v9 +; GFX1170-NEXT: v_mov_b32_e32 v22, v8 +; GFX1170-NEXT: v_mov_b32_e32 v21, v7 +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v9 +; GFX1170-NEXT: v_mov_b32_e32 v26, v8 +; GFX1170-NEXT: v_mov_b32_e32 v25, v7 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v31, v9 +; GFX1170-NEXT: v_mov_b32_e32 v30, v8 +; GFX1170-NEXT: v_mov_b32_e32 v29, v7 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX1170-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX1170-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX1170-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v10, v[10:11], off @@ -90,6 +145,27 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v22, v[8:9], off +; GFX1170-NEXT: v_mov_b32_e32 v9, v7 +; GFX1170-NEXT: v_mov_b32_e32 v8, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v7 +; GFX1170-NEXT: v_mov_b32_e32 v18, v6 +; GFX1170-NEXT: v_mov_b32_e32 v21, v7 +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 +; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX1170-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX1170-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX1170-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX1170-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v22, v[8:9], off @@ -128,6 +204,27 @@ bb: } define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v22, v[8:9], off +; GFX1170-NEXT: v_mov_b32_e32 v9, v7 +; GFX1170-NEXT: v_mov_b32_e32 v8, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v7 +; GFX1170-NEXT: v_mov_b32_e32 v18, v6 +; GFX1170-NEXT: v_mov_b32_e32 v21, v7 +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 +; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX1170-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX1170-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX1170-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX1170-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v22, v[8:9], off @@ -166,6 +263,33 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v7, v[7:8], off +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v5 +; GFX1170-NEXT: v_mov_b32_e32 v18, v4 +; GFX1170-NEXT: v_mov_b32_e32 v17, v3 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v23, v5 +; GFX1170-NEXT: v_mov_b32_e32 v22, v4 +; GFX1170-NEXT: v_mov_b32_e32 v21, v3 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v5 +; GFX1170-NEXT: v_mov_b32_e32 v26, v4 +; GFX1170-NEXT: v_mov_b32_e32 v25, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v7, v[7:8], off @@ -210,6 +334,21 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v6, v[6:7], off +; GFX1170-NEXT: v_mov_b32_e32 v15, v5 +; GFX1170-NEXT: v_mov_b32_e32 v14, v4 +; GFX1170-NEXT: v_mov_b32_e32 v13, v3 +; GFX1170-NEXT: v_mov_b32_e32 v12, v2 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX1170-NEXT: global_store_b128 v[8:9], v[12:15], off +; GFX1170-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v6, v[6:7], off @@ -236,6 +375,21 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v7, v[7:8], off +; GFX1170-NEXT: v_mov_b32_e32 v16, v6 +; GFX1170-NEXT: v_mov_b32_e32 v15, v5 +; GFX1170-NEXT: v_mov_b32_e32 v14, v4 +; GFX1170-NEXT: v_mov_b32_e32 v13, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 +; GFX1170-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX1170-NEXT: global_store_b128 v[9:10], v[13:16], off +; GFX1170-NEXT: global_store_b128 v[11:12], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v7, v[7:8], off @@ -262,6 +416,33 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v7, v[7:8], off +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v5 +; GFX1170-NEXT: v_mov_b32_e32 v18, v4 +; GFX1170-NEXT: v_mov_b32_e32 v17, v3 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v23, v5 +; GFX1170-NEXT: v_mov_b32_e32 v22, v4 +; GFX1170-NEXT: v_mov_b32_e32 v21, v3 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v5 +; GFX1170-NEXT: v_mov_b32_e32 v26, v4 +; GFX1170-NEXT: v_mov_b32_e32 v25, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v7, v[7:8], off @@ -306,6 +487,33 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v7, v[7:8], off +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v5 +; GFX1170-NEXT: v_mov_b32_e32 v18, v4 +; GFX1170-NEXT: v_mov_b32_e32 v17, v3 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v23, v5 +; GFX1170-NEXT: v_mov_b32_e32 v22, v4 +; GFX1170-NEXT: v_mov_b32_e32 v21, v3 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v5 +; GFX1170-NEXT: v_mov_b32_e32 v26, v4 +; GFX1170-NEXT: v_mov_b32_e32 v25, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v7, v[7:8], off @@ -350,6 +558,33 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v7, v[7:8], off +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v5 +; GFX1170-NEXT: v_mov_b32_e32 v18, v4 +; GFX1170-NEXT: v_mov_b32_e32 v17, v3 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v23, v5 +; GFX1170-NEXT: v_mov_b32_e32 v22, v4 +; GFX1170-NEXT: v_mov_b32_e32 v21, v3 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v5 +; GFX1170-NEXT: v_mov_b32_e32 v26, v4 +; GFX1170-NEXT: v_mov_b32_e32 v25, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v7, v[7:8], off @@ -394,6 +629,33 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v7, v[7:8], off +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v5 +; GFX1170-NEXT: v_mov_b32_e32 v18, v4 +; GFX1170-NEXT: v_mov_b32_e32 v17, v3 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v23, v5 +; GFX1170-NEXT: v_mov_b32_e32 v22, v4 +; GFX1170-NEXT: v_mov_b32_e32 v21, v3 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v5 +; GFX1170-NEXT: v_mov_b32_e32 v26, v4 +; GFX1170-NEXT: v_mov_b32_e32 v25, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v7, v[7:8], off @@ -448,3 +710,5 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll index 957b7b1b2c77..bb256883c29a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C) store <4 x float> %res, ptr addrspace(1) %out @@ -14,11 +15,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C) store <4 x float> %res, ptr addrspace(1) %out @@ -26,11 +27,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0) store <4 x half> %res, ptr addrspace(1) %out @@ -38,11 +39,11 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_bf16_16x16x16_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0) store <4 x i16> %res, ptr addrspace(1) %out @@ -50,11 +51,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -62,11 +63,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -74,11 +75,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) store <4 x float> %res, ptr addrspace(1) %out @@ -86,11 +87,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) store <4 x float> %res, ptr addrspace(1) %out @@ -98,11 +99,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) store <4 x float> %res, ptr addrspace(1) %out @@ -110,11 +111,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) store <4 x float> %res, ptr addrspace(1) %out @@ -122,11 +123,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -134,11 +135,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 -; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index) store <4 x float> %res, ptr addrspace(1) %out @@ -146,11 +147,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 -; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index) store <4 x float> %res, ptr addrspace(1) %out @@ -158,11 +159,11 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 -; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f16_16x16x32_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index) store <4 x half> %res, ptr addrspace(1) %out @@ -170,11 +171,11 @@ bb: } define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 -; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index) store <4 x i16> %res, ptr addrspace(1) %out @@ -182,11 +183,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -194,11 +195,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 -; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -206,11 +207,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -218,11 +219,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) store <4 x float> %res, ptr addrspace(1) %out @@ -230,11 +231,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) store <4 x float> %res, ptr addrspace(1) %out @@ -242,11 +243,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) store <4 x float> %res, ptr addrspace(1) %out @@ -254,11 +255,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) store <4 x float> %res, ptr addrspace(1) %out @@ -287,3 +288,6 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1170: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll index 4a010071d58c..bc5c3283fb49 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -1,14 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) @@ -17,13 +18,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) @@ -32,13 +33,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) @@ -47,13 +48,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) @@ -62,13 +63,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) @@ -77,13 +78,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) @@ -92,11 +93,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) @@ -105,11 +106,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) @@ -118,11 +119,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x half> %C %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) @@ -131,11 +132,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) @@ -144,13 +145,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) @@ -159,13 +160,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) @@ -174,13 +175,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) @@ -189,13 +190,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) @@ -204,13 +205,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) @@ -219,13 +220,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) @@ -234,13 +235,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) @@ -249,13 +250,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) @@ -264,13 +265,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 -; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) @@ -279,13 +280,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 -; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <16 x half> %B %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) @@ -294,11 +295,11 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) @@ -307,11 +308,11 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <16 x half> %B %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) @@ -322,13 +323,13 @@ bb: ; both neg and abs patterns (wmma matrix C f32 or f16 ) define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %fneg.fabs.C = fneg <8 x float> %fabs.C @@ -338,11 +339,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) %fneg.fabs.C = fneg <8 x half> %fabs.C @@ -352,15 +353,15 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: s_endpgm bb: %el3 = extractelement <8 x float> %C, i32 3 %el3.fabs = call float @llvm.fabs.f32(float %el3) @@ -374,13 +375,13 @@ bb: ; A or B matrix modifier and constant in C define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> ) @@ -389,11 +390,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> , i1 0) @@ -404,6 +405,24 @@ bb: ; pack f16 elements with v_perm_b32 since they don't come from same b32 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: flat_load_b128 v[12:15], v[8:9] offset:16 +; GFX1170-NEXT: flat_load_b128 v[16:19], v[8:9] +; GFX1170-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX1170-NEXT: v_mov_b16_e32 v8.l, v15.l +; GFX1170-NEXT: v_mov_b16_e32 v9.l, v14.l +; GFX1170-NEXT: v_perm_b32 v14, v13, v12, 0x5040100 +; GFX1170-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1170-NEXT: v_perm_b32 v13, v19, v18, 0x5040100 +; GFX1170-NEXT: v_perm_b32 v12, v17, v16, 0x5040100 +; GFX1170-NEXT: v_perm_b32 v15, v8, v9, 0x5040100 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] +; GFX1170-NEXT: global_store_b128 v[10:11], v[12:15], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll index 1b44e8f01c0f..2558dc390364 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll @@ -1,14 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -16,6 +17,24 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_mov_b32_e32 v11, v10 +; GFX1170-NEXT: v_mov_b32_e32 v12, v10 +; GFX1170-NEXT: v_mov_b32_e32 v13, v10 +; GFX1170-NEXT: v_mov_b32_e32 v14, v10 +; GFX1170-NEXT: v_mov_b32_e32 v15, v10 +; GFX1170-NEXT: v_mov_b32_e32 v16, v10 +; GFX1170-NEXT: v_mov_b32_e32 v17, v10 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 @@ -36,13 +55,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -50,6 +69,24 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_mov_b32_e32 v11, v10 +; GFX1170-NEXT: v_mov_b32_e32 v12, v10 +; GFX1170-NEXT: v_mov_b32_e32 v13, v10 +; GFX1170-NEXT: v_mov_b32_e32 v14, v10 +; GFX1170-NEXT: v_mov_b32_e32 v15, v10 +; GFX1170-NEXT: v_mov_b32_e32 v16, v10 +; GFX1170-NEXT: v_mov_b32_e32 v17, v10 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 @@ -70,11 +107,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) store <8 x half> %res, ptr addrspace(1) %out @@ -82,6 +119,17 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: v_mov_b32_e32 v10, 0x42004200 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1170-NEXT: v_mov_b32_e32 v11, v10 +; GFX1170-NEXT: v_mov_b32_e32 v12, v10 +; GFX1170-NEXT: v_mov_b32_e32 v13, v10 +; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200 @@ -98,6 +146,17 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: v_mov_b32_e32 v10, 0x3f803f80 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1170-NEXT: v_mov_b32_e32 v11, v10 +; GFX1170-NEXT: v_mov_b32_e32 v12, v10 +; GFX1170-NEXT: v_mov_b32_e32 v13, v10 +; GFX1170-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v10, 0x3f803f80 @@ -114,6 +173,17 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1170-NEXT: v_mov_b32_e32 v11, v10 +; GFX1170-NEXT: v_mov_b32_e32 v12, v10 +; GFX1170-NEXT: v_mov_b32_e32 v13, v10 +; GFX1170-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0 @@ -130,13 +200,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -144,6 +214,24 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: v_mov_b32_e32 v6, 0x80 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_mov_b32_e32 v7, v6 +; GFX1170-NEXT: v_mov_b32_e32 v8, v6 +; GFX1170-NEXT: v_mov_b32_e32 v9, v6 +; GFX1170-NEXT: v_mov_b32_e32 v10, v6 +; GFX1170-NEXT: v_mov_b32_e32 v11, v6 +; GFX1170-NEXT: v_mov_b32_e32 v12, v6 +; GFX1170-NEXT: v_mov_b32_e32 v13, v6 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 @@ -164,13 +252,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -178,6 +266,24 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_mov_b32_e32 v5, v4 +; GFX1170-NEXT: v_mov_b32_e32 v6, v4 +; GFX1170-NEXT: v_mov_b32_e32 v7, v4 +; GFX1170-NEXT: v_mov_b32_e32 v8, v4 +; GFX1170-NEXT: v_mov_b32_e32 v9, v4 +; GFX1170-NEXT: v_mov_b32_e32 v10, v4 +; GFX1170-NEXT: v_mov_b32_e32 v11, v4 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX1170-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 @@ -198,13 +304,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -212,6 +318,24 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_mov_b32_e32 v7, v6 +; GFX1170-NEXT: v_mov_b32_e32 v8, v6 +; GFX1170-NEXT: v_mov_b32_e32 v9, v6 +; GFX1170-NEXT: v_mov_b32_e32 v10, v6 +; GFX1170-NEXT: v_mov_b32_e32 v11, v6 +; GFX1170-NEXT: v_mov_b32_e32 v12, v6 +; GFX1170-NEXT: v_mov_b32_e32 v13, v6 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 @@ -232,13 +356,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -246,6 +370,24 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_mov_b32_e32 v7, v6 +; GFX1170-NEXT: v_mov_b32_e32 v8, v6 +; GFX1170-NEXT: v_mov_b32_e32 v9, v6 +; GFX1170-NEXT: v_mov_b32_e32 v10, v6 +; GFX1170-NEXT: v_mov_b32_e32 v11, v6 +; GFX1170-NEXT: v_mov_b32_e32 v12, v6 +; GFX1170-NEXT: v_mov_b32_e32 v13, v6 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 @@ -266,13 +408,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -280,6 +422,24 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_mov_b32_e32 v7, v6 +; GFX1170-NEXT: v_mov_b32_e32 v8, v6 +; GFX1170-NEXT: v_mov_b32_e32 v9, v6 +; GFX1170-NEXT: v_mov_b32_e32 v10, v6 +; GFX1170-NEXT: v_mov_b32_e32 v11, v6 +; GFX1170-NEXT: v_mov_b32_e32 v12, v6 +; GFX1170-NEXT: v_mov_b32_e32 v13, v6 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 @@ -300,13 +460,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -314,6 +474,24 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_mov_b32_e32 v7, v6 +; GFX1170-NEXT: v_mov_b32_e32 v8, v6 +; GFX1170-NEXT: v_mov_b32_e32 v9, v6 +; GFX1170-NEXT: v_mov_b32_e32 v10, v6 +; GFX1170-NEXT: v_mov_b32_e32 v11, v6 +; GFX1170-NEXT: v_mov_b32_e32 v12, v6 +; GFX1170-NEXT: v_mov_b32_e32 v13, v6 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 @@ -334,13 +512,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -348,6 +526,24 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX1170-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: v_mov_b32_e32 v6, 0x80 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_mov_b32_e32 v7, v6 +; GFX1170-NEXT: v_mov_b32_e32 v8, v6 +; GFX1170-NEXT: v_mov_b32_e32 v9, v6 +; GFX1170-NEXT: v_mov_b32_e32 v10, v6 +; GFX1170-NEXT: v_mov_b32_e32 v11, v6 +; GFX1170-NEXT: v_mov_b32_e32 v12, v6 +; GFX1170-NEXT: v_mov_b32_e32 v13, v6 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll index 945305848b3e..9d8f26ea11cb 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll @@ -1,14 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -16,13 +17,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -30,13 +31,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) store <8 x i32> %res, ptr addrspace(1) %out @@ -46,13 +47,13 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 -; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -60,13 +61,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 -; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -74,13 +75,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 -; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1) store <8 x i32> %res, ptr addrspace(1) %out @@ -90,13 +91,13 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -104,13 +105,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -118,13 +119,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) store <8 x i32> %res, ptr addrspace(1) %out @@ -136,13 +137,13 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -150,13 +151,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -164,13 +165,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) store <8 x i32> %res, ptr addrspace(1) %out @@ -180,13 +181,13 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -194,13 +195,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -208,13 +209,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) store <8 x i32> %res, ptr addrspace(1) %out @@ -224,13 +225,13 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -238,13 +239,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -252,13 +253,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1) store <8 x i32> %res, ptr addrspace(1) %out @@ -271,3 +272,6 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 immarg, <2 x declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1170: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll index cd7edc21718c..f7dd2d189a2b 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll @@ -1,7 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v20, v[20:21], off +; GFX1170-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX1170-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX1170-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX1170-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX1170-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v20, v[20:21], off @@ -32,6 +52,25 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v20, v[20:21], off +; GFX1170-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX1170-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX1170-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX1170-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX1170-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v20, v[20:21], off @@ -62,6 +101,19 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v16, v[16:17], off +; GFX1170-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX1170-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 +; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX1170-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v16, v[16:17], off @@ -86,6 +138,19 @@ bb: } define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v16, v[16:17], off +; GFX1170-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX1170-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 +; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX1170-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v16, v[16:17], off @@ -110,6 +175,25 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v14, v[14:15], off +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v14, v[14:15], off @@ -140,6 +224,25 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v11, v[11:12], off +; GFX1170-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9 +; GFX1170-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX1170-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX1170-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 +; GFX1170-NEXT: global_store_b128 v[13:14], v[17:20], off +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 +; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v11, v[11:12], off @@ -170,6 +273,25 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v14, v[14:15], off +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v14, v[14:15], off @@ -200,6 +322,25 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v14, v[14:15], off +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v14, v[14:15], off @@ -230,6 +371,25 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v14, v[14:15], off +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v14, v[14:15], off @@ -260,6 +420,25 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v14, v[14:15], off +; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX1170-NEXT: s_clause 0x1 +; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v14, v[14:15], off @@ -299,3 +478,5 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll index d67625248669..0993c00c3041 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll @@ -1,14 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %C) store <8 x float> %res, ptr addrspace(1) %out @@ -16,13 +17,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C) store <8 x float> %res, ptr addrspace(1) %out @@ -30,11 +31,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0) store <8 x half> %res, ptr addrspace(1) %out @@ -42,11 +43,11 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_bf16_16x16x16_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0) store <8 x i16> %res, ptr addrspace(1) %out @@ -54,13 +55,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -68,13 +69,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 -; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -82,13 +83,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) store <8 x float> %res, ptr addrspace(1) %out @@ -96,13 +97,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) store <8 x float> %res, ptr addrspace(1) %out @@ -110,13 +111,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) store <8 x float> %res, ptr addrspace(1) %out @@ -124,13 +125,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) store <8 x float> %res, ptr addrspace(1) %out @@ -138,13 +139,13 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -153,13 +154,13 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 -; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) store <8 x float> %res, ptr addrspace(1) %out @@ -167,13 +168,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 -; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) store <8 x float> %res, ptr addrspace(1) %out @@ -181,11 +182,11 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 -; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f16_16x16x32_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) store <8 x half> %res, ptr addrspace(1) %out @@ -193,11 +194,11 @@ bb: } define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 -; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) store <8 x i16> %res, ptr addrspace(1) %out @@ -205,13 +206,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -219,13 +220,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 -; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -233,13 +234,13 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -247,13 +248,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) store <8 x float> %res, ptr addrspace(1) %out @@ -261,13 +262,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) store <8 x float> %res, ptr addrspace(1) %out @@ -275,13 +276,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) store <8 x float> %res, ptr addrspace(1) %out @@ -289,13 +290,13 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) store <8 x float> %res, ptr addrspace(1) %out @@ -324,3 +325,6 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1170: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll index 53bede84513c..1a2d59e96959 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -1,13 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GFX12,GFX12-TRUE16 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170-TRUE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170-FAKE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12-TRUE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12-FAKE16 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <4 x half> %A %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C) @@ -16,11 +18,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <4 x half> %B %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) @@ -29,11 +31,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) @@ -42,11 +44,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) @@ -55,11 +57,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) @@ -68,11 +70,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) @@ -81,11 +83,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <4 x half> %A %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) @@ -94,11 +96,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <4 x half> %B %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) @@ -107,11 +109,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x half> %C %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) @@ -120,11 +122,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) @@ -133,11 +135,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C) @@ -146,11 +148,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C) @@ -159,11 +161,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C) @@ -172,11 +174,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C) @@ -185,11 +187,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C) @@ -198,11 +200,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C) @@ -211,11 +213,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C) @@ -224,11 +226,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C) @@ -237,11 +239,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <4 x half> %A %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) @@ -250,11 +252,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) @@ -263,11 +265,11 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <4 x half> %A %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) @@ -276,11 +278,11 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) @@ -291,11 +293,11 @@ bb: ; both neg and abs patterns (wmma matrix C f32 or f16 ) define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %fneg.fabs.C = fneg <4 x float> %fabs.C @@ -305,11 +307,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) %fneg.fabs.C = fneg <4 x half> %fabs.C @@ -319,13 +321,13 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %el3 = extractelement <4 x float> %C, i32 3 %el3.fabs = call float @llvm.fabs.f32(float %el3) @@ -339,11 +341,11 @@ bb: ; A or B matrix modifier and constant in C define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %fneg.A = fneg <4 x half> %A %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> ) @@ -352,11 +354,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off +; GCN-NEXT: s_endpgm bb: %fneg.B = fneg <4 x half> %B %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> , i1 0) @@ -367,6 +369,29 @@ bb: ; pack f16 elements with v_perm_b32 since they don't come from same b32 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) { +; GFX1170-TRUE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX1170-TRUE16: ; %bb.0: ; %bb +; GFX1170-TRUE16-NEXT: flat_load_b128 v[8:11], v[4:5] +; GFX1170-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1170-TRUE16-NEXT: v_mov_b16_e32 v10.h, v11.l +; GFX1170-TRUE16-NEXT: v_mov_b16_e32 v8.h, v9.l +; GFX1170-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1170-TRUE16-NEXT: v_mov_b32_e32 v9, v10 +; GFX1170-TRUE16-NEXT: v_wmma_f16_16x16x16_f16 v[8:9], v[0:1], v[2:3], v[8:9] neg_lo:[0,0,1] +; GFX1170-TRUE16-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1170-TRUE16-NEXT: s_endpgm +; +; GFX1170-FAKE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX1170-FAKE16: ; %bb.0: ; %bb +; GFX1170-FAKE16-NEXT: flat_load_b128 v[8:11], v[4:5] +; GFX1170-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1170-FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX1170-FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX1170-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-FAKE16-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX1170-FAKE16-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX1170-FAKE16-NEXT: s_endpgm +; ; GFX12-TRUE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: ; GFX12-TRUE16: ; %bb.0: ; %bb ; GFX12-TRUE16-NEXT: flat_load_b128 v[8:11], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll index a8f5726632aa..a4222338a503 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -14,16 +15,16 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v7, v6 -; GFX12-NEXT: v_mov_b32_e32 v8, v6 -; GFX12-NEXT: v_mov_b32_e32 v9, v6 -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9] -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GCN-NEXT: v_mov_b32_e32 v7, v6 +; GCN-NEXT: v_mov_b32_e32 v8, v6 +; GCN-NEXT: v_mov_b32_e32 v9, v6 +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9] +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -31,11 +32,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -43,16 +44,16 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v7, v6 -; GFX12-NEXT: v_mov_b32_e32 v8, v6 -; GFX12-NEXT: v_mov_b32_e32 v9, v6 -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9] -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GCN-NEXT: v_mov_b32_e32 v7, v6 +; GCN-NEXT: v_mov_b32_e32 v8, v6 +; GCN-NEXT: v_mov_b32_e32 v9, v6 +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9] +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -60,11 +61,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 -; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 +; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) store <4 x half> %res, ptr addrspace(1) %out @@ -72,14 +73,14 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v6, 0x42004200 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v7, v6 -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] -; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v6, 0x42004200 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_mov_b32_e32 v7, v6 +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] +; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) store <4 x half> %res, ptr addrspace(1) %out @@ -87,14 +88,14 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v6, 0x3f803f80 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v7, v6 -; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] -; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v6, 0x3f803f80 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_mov_b32_e32 v7, v6 +; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) store <4 x i16> %res, ptr addrspace(1) %out @@ -102,14 +103,14 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v7, v6 -; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] -; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_mov_b32_e32 v7, v6 +; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) store <4 x i16> %res, ptr addrspace(1) %out @@ -117,11 +118,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -129,16 +130,16 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v7, v4 -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v4, 0x80 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GCN-NEXT: v_mov_b32_e32 v5, v4 +; GCN-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NEXT: v_mov_b32_e32 v7, v4 +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -146,11 +147,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -158,16 +159,16 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v7, v4 -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v4, 0x80 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GCN-NEXT: v_mov_b32_e32 v5, v4 +; GCN-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NEXT: v_mov_b32_e32 v7, v4 +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -175,11 +176,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -187,16 +188,16 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v7, v4 -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GCN-NEXT: v_mov_b32_e32 v5, v4 +; GCN-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NEXT: v_mov_b32_e32 v7, v4 +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -204,11 +205,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -216,16 +217,16 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v7, v4 -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GCN-NEXT: v_mov_b32_e32 v5, v4 +; GCN-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NEXT: v_mov_b32_e32 v7, v4 +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -233,11 +234,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -245,16 +246,16 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v7, v4 -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GCN-NEXT: v_mov_b32_e32 v5, v4 +; GCN-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NEXT: v_mov_b32_e32 v7, v4 +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -262,11 +263,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -274,16 +275,16 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v7, v4 -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GCN-NEXT: v_mov_b32_e32 v5, v4 +; GCN-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NEXT: v_mov_b32_e32 v7, v4 +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> ) store <4 x float> %res, ptr addrspace(1) %out @@ -291,11 +292,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -303,16 +304,16 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v7, v4 -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v4, 0x80 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GCN-NEXT: v_mov_b32_e32 v5, v4 +; GCN-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NEXT: v_mov_b32_e32 v7, v4 +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -330,3 +331,6 @@ declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32, i32, < declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32, i32, <4 x float>) declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32, i32, <4 x float>) declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1170: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll index 9303dbfad437..baeb81ab6295 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -14,11 +15,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -26,11 +27,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) store <4 x i32> %res, ptr addrspace(1) %out @@ -40,11 +41,11 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -52,11 +53,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -64,11 +65,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) store <4 x i32> %res, ptr addrspace(1) %out @@ -78,11 +79,11 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -90,11 +91,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -102,11 +103,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) store <4 x i32> %res, ptr addrspace(1) %out @@ -114,11 +115,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -126,11 +127,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -138,11 +139,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) store <4 x i32> %res, ptr addrspace(1) %out @@ -152,11 +153,11 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] -; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -164,11 +165,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] -; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -176,11 +177,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp -; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1) store <4 x i32> %res, ptr addrspace(1) %out @@ -190,11 +191,11 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -202,11 +203,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -214,11 +215,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1) store <4 x i32> %res, ptr addrspace(1) %out @@ -231,3 +232,6 @@ declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 immarg, i32, i declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1170: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll index fdfec74e01b7..183230a1242b 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll @@ -1,7 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v10, v[10:11], off +; GFX1170-NEXT: v_mov_b32_e32 v23, v9 +; GFX1170-NEXT: v_mov_b32_e32 v22, v8 +; GFX1170-NEXT: v_mov_b32_e32 v21, v7 +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v9 +; GFX1170-NEXT: v_mov_b32_e32 v26, v8 +; GFX1170-NEXT: v_mov_b32_e32 v25, v7 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v31, v9 +; GFX1170-NEXT: v_mov_b32_e32 v30, v8 +; GFX1170-NEXT: v_mov_b32_e32 v29, v7 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX1170-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX1170-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX1170-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v10, v[10:11], off @@ -46,6 +74,33 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v10, v[10:11], off +; GFX1170-NEXT: v_mov_b32_e32 v23, v9 +; GFX1170-NEXT: v_mov_b32_e32 v22, v8 +; GFX1170-NEXT: v_mov_b32_e32 v21, v7 +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v9 +; GFX1170-NEXT: v_mov_b32_e32 v26, v8 +; GFX1170-NEXT: v_mov_b32_e32 v25, v7 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v31, v9 +; GFX1170-NEXT: v_mov_b32_e32 v30, v8 +; GFX1170-NEXT: v_mov_b32_e32 v29, v7 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX1170-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX1170-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX1170-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v10, v[10:11], off @@ -90,6 +145,27 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v22, v[8:9], off +; GFX1170-NEXT: v_mov_b32_e32 v9, v7 +; GFX1170-NEXT: v_mov_b32_e32 v8, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v7 +; GFX1170-NEXT: v_mov_b32_e32 v18, v6 +; GFX1170-NEXT: v_mov_b32_e32 v21, v7 +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 +; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX1170-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX1170-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX1170-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX1170-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v22, v[8:9], off @@ -128,6 +204,27 @@ bb: } define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v22, v[8:9], off +; GFX1170-NEXT: v_mov_b32_e32 v9, v7 +; GFX1170-NEXT: v_mov_b32_e32 v8, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v7 +; GFX1170-NEXT: v_mov_b32_e32 v18, v6 +; GFX1170-NEXT: v_mov_b32_e32 v21, v7 +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 +; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX1170-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX1170-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX1170-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX1170-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v22, v[8:9], off @@ -166,6 +263,33 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v7, v[7:8], off +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v5 +; GFX1170-NEXT: v_mov_b32_e32 v18, v4 +; GFX1170-NEXT: v_mov_b32_e32 v17, v3 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v23, v5 +; GFX1170-NEXT: v_mov_b32_e32 v22, v4 +; GFX1170-NEXT: v_mov_b32_e32 v21, v3 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v5 +; GFX1170-NEXT: v_mov_b32_e32 v26, v4 +; GFX1170-NEXT: v_mov_b32_e32 v25, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v7, v[7:8], off @@ -210,6 +334,21 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v6, v[6:7], off +; GFX1170-NEXT: v_mov_b32_e32 v15, v5 +; GFX1170-NEXT: v_mov_b32_e32 v14, v4 +; GFX1170-NEXT: v_mov_b32_e32 v13, v3 +; GFX1170-NEXT: v_mov_b32_e32 v12, v2 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 +; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX1170-NEXT: global_store_b128 v[8:9], v[12:15], off +; GFX1170-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v6, v[6:7], off @@ -236,6 +375,21 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX1170-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v7, v[7:8], off +; GFX1170-NEXT: v_mov_b32_e32 v16, v6 +; GFX1170-NEXT: v_mov_b32_e32 v15, v5 +; GFX1170-NEXT: v_mov_b32_e32 v14, v4 +; GFX1170-NEXT: v_mov_b32_e32 v13, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 +; GFX1170-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX1170-NEXT: global_store_b128 v[9:10], v[13:16], off +; GFX1170-NEXT: global_store_b128 v[11:12], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v7, v[7:8], off @@ -262,6 +416,33 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v7, v[7:8], off +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v5 +; GFX1170-NEXT: v_mov_b32_e32 v18, v4 +; GFX1170-NEXT: v_mov_b32_e32 v17, v3 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v23, v5 +; GFX1170-NEXT: v_mov_b32_e32 v22, v4 +; GFX1170-NEXT: v_mov_b32_e32 v21, v3 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v5 +; GFX1170-NEXT: v_mov_b32_e32 v26, v4 +; GFX1170-NEXT: v_mov_b32_e32 v25, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v7, v[7:8], off @@ -306,6 +487,33 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v7, v[7:8], off +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v5 +; GFX1170-NEXT: v_mov_b32_e32 v18, v4 +; GFX1170-NEXT: v_mov_b32_e32 v17, v3 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v23, v5 +; GFX1170-NEXT: v_mov_b32_e32 v22, v4 +; GFX1170-NEXT: v_mov_b32_e32 v21, v3 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v5 +; GFX1170-NEXT: v_mov_b32_e32 v26, v4 +; GFX1170-NEXT: v_mov_b32_e32 v25, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v7, v[7:8], off @@ -350,6 +558,33 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v7, v[7:8], off +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v5 +; GFX1170-NEXT: v_mov_b32_e32 v18, v4 +; GFX1170-NEXT: v_mov_b32_e32 v17, v3 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v23, v5 +; GFX1170-NEXT: v_mov_b32_e32 v22, v4 +; GFX1170-NEXT: v_mov_b32_e32 v21, v3 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v5 +; GFX1170-NEXT: v_mov_b32_e32 v26, v4 +; GFX1170-NEXT: v_mov_b32_e32 v25, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v7, v[7:8], off @@ -394,6 +629,33 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX1170: ; %bb.0: ; %bb +; GFX1170-NEXT: global_load_b32 v7, v[7:8], off +; GFX1170-NEXT: v_mov_b32_e32 v20, v6 +; GFX1170-NEXT: v_mov_b32_e32 v19, v5 +; GFX1170-NEXT: v_mov_b32_e32 v18, v4 +; GFX1170-NEXT: v_mov_b32_e32 v17, v3 +; GFX1170-NEXT: v_mov_b32_e32 v24, v6 +; GFX1170-NEXT: v_mov_b32_e32 v23, v5 +; GFX1170-NEXT: v_mov_b32_e32 v22, v4 +; GFX1170-NEXT: v_mov_b32_e32 v21, v3 +; GFX1170-NEXT: v_mov_b32_e32 v28, v6 +; GFX1170-NEXT: v_mov_b32_e32 v27, v5 +; GFX1170-NEXT: v_mov_b32_e32 v26, v4 +; GFX1170-NEXT: v_mov_b32_e32 v25, v3 +; GFX1170-NEXT: s_waitcnt vmcnt(0) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX1170-NEXT: s_endpgm +; ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: global_load_b32 v7, v[7:8], off @@ -448,3 +710,5 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8( declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8) declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8) declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll index 896efb06d559..60dc7cc766f7 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %C) store <4 x float> %res, ptr addrspace(1) %out @@ -14,11 +15,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] -; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C) store <4 x float> %res, ptr addrspace(1) %out @@ -26,11 +27,11 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f16_16x16x16_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0) store <4 x half> %res, ptr addrspace(1) %out @@ -38,11 +39,11 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] -; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_bf16_16x16x16_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0) store <4 x i16> %res, ptr addrspace(1) %out @@ -50,11 +51,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -62,11 +63,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -74,11 +75,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C) store <4 x float> %res, ptr addrspace(1) %out @@ -86,11 +87,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C) store <4 x float> %res, ptr addrspace(1) %out @@ -98,11 +99,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C) store <4 x float> %res, ptr addrspace(1) %out @@ -110,11 +111,11 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C) store <4 x float> %res, ptr addrspace(1) %out @@ -122,11 +123,11 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] -; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -134,11 +135,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 -; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index) store <4 x float> %res, ptr addrspace(1) %out @@ -146,11 +147,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 -; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index) store <4 x float> %res, ptr addrspace(1) %out @@ -158,11 +159,11 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 -; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f16_16x16x32_f16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index) store <4 x half> %res, ptr addrspace(1) %out @@ -170,11 +171,11 @@ bb: } define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 -; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index) store <4 x i16> %res, ptr addrspace(1) %out @@ -182,11 +183,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -194,11 +195,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 -; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x32_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -206,11 +207,11 @@ bb: } define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_i32_16x16x64_iu4: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) store <4 x i32> %res, ptr addrspace(1) %out @@ -218,11 +219,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) store <4 x float> %res, ptr addrspace(1) %out @@ -230,11 +231,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) store <4 x float> %res, ptr addrspace(1) %out @@ -242,11 +243,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) store <4 x float> %res, ptr addrspace(1) %out @@ -254,11 +255,11 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 -; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off +; GCN-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) store <4 x float> %res, ptr addrspace(1) %out @@ -287,3 +288,6 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8( declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8) declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8) declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1170: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir index ef85de201294..897bd2d8517a 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir +++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX1170 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s # D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0. # $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0 @@ -11,12 +12,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 - ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec ... @@ -27,12 +28,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 - ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec ... @@ -43,11 +44,11 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec ... @@ -58,12 +59,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec ... @@ -73,12 +74,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec ... @@ -89,11 +90,11 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 - ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec ... @@ -104,12 +105,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec ... @@ -120,12 +121,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec ... @@ -136,11 +137,11 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec ... @@ -151,12 +152,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec ... @@ -167,12 +168,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec ... @@ -183,11 +184,11 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 - ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec ... @@ -198,6 +199,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GFX1170-NEXT: {{ $}} + ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX1170-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec + ; ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 ; GFX12-NEXT: {{ $}} @@ -214,12 +221,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 - ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec ... @@ -230,12 +237,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 - ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec ... @@ -246,11 +253,11 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 - ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec ... @@ -261,6 +268,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX1170-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 + ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX1170-NEXT: {{ $}} + ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX1170-NEXT: early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec + ; ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 ; GFX12-NEXT: {{ $}} @@ -277,12 +290,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 - ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec ... @@ -293,12 +306,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 - ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec ... @@ -309,11 +322,11 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 - ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec ... @@ -324,6 +337,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 + ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX1170-NEXT: {{ $}} + ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX1170-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec + ; ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 ; GFX12-NEXT: {{ $}} @@ -340,12 +359,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 - ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir index 4073964e2b03..0a80543b9977 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir +++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX1170 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s # D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0. # $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0 @@ -11,12 +12,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec ... @@ -27,12 +28,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec ... @@ -43,11 +44,11 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec ... @@ -58,12 +59,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec ... @@ -74,12 +75,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec ... @@ -90,11 +91,11 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec ... @@ -105,12 +106,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec ... @@ -121,12 +122,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec ... @@ -137,11 +138,11 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec ... @@ -152,12 +153,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec ... @@ -168,12 +169,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + ; GCN-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec ... @@ -184,11 +185,11 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 - ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec ... @@ -199,6 +200,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GFX1170-NEXT: {{ $}} + ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX1170-NEXT: early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec + ; ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 ; GFX12-NEXT: {{ $}} @@ -215,12 +222,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 - ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec ... @@ -231,12 +238,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 - ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec ... @@ -247,11 +254,11 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec ... @@ -262,6 +269,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 + ; GFX1170-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 + ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 + ; GFX1170-NEXT: {{ $}} + ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX1170-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec + ; ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 ; GFX12-NEXT: {{ $}} @@ -278,12 +291,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec ... @@ -294,12 +307,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec ... @@ -310,11 +323,11 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec ... @@ -325,6 +338,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 + ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX1170-NEXT: {{ $}} + ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX1170-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec + ; ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 ; GFX12-NEXT: {{ $}} @@ -341,12 +360,12 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 - ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec - ; GFX12-NEXT: V_NOP_e32 implicit $exec - ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec + ; GCN-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec ... diff --git a/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s new file mode 100644 index 000000000000..abdb344ac061 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s @@ -0,0 +1,1529 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 -show-encoding %s | FileCheck --check-prefix=GFX1170 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 %s 2>&1 | FileCheck --check-prefix=GFX1170-ERR --implicit-check-not=error: %s + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] +// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] +// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], s[0:3], v[4:7], v[8:15] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7], v[8:15] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], 1.0, v[4:7], v[8:15] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0, v[8:15] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 +// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 +// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3], v[4:7], v[8:15] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7], v[8:15] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], 1.0, v[4:7], v[8:15] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0, v[8:15] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 +// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 +// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] +// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] +// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], s[0:3], v[4:7], v[8:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7], v[8:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], 1.0, v[4:7], v[8:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0, v[8:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 +// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 +// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3], v[4:7], v[8:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7], v[8:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0, v[4:7], v[8:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0, v[8:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], 1, v[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1, v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 +// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] + + + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], s0, v1, v[2:9] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1, v[2:9] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], 1, v1, v[2:9] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1, v[2:9] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 +// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 +// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 +// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 +// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 +// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 +// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], 1, v[2:3], v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1, v[4:11] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 +// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 +// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] + + + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] +// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] +// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] +// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] +// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3], v[4:11], v20 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11], v20 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], 1.0, v[4:11], v20 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0, v20 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3], v[4:11], v20 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11], v20 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0, v[4:11], v20 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0, v20 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] +// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] +// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] +// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] +// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3], v[4:11], v16 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11], v16 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], 1.0, v[4:11], v16 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0, v16 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3], v[4:11], v16 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11], v16 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0, v[4:11], v16 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0, v16 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1], v[2:5], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], 1, v[2:5], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1, v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], s0, v[1:2], v11 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1], v11 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], 1, v[1:2], v11 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1, v11 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +// GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1], v[2:5], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], 1, v[2:5], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1, v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1], v[2:5], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0, v[2:5], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0, v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1], v[2:5], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0, v[2:5], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0, v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1], v[2:5], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0, v[2:5], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0, v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1], v[2:5], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0, v[2:5], v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0, v14 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s b/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s new file mode 100644 index 000000000000..6b1b889f8bed --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s @@ -0,0 +1,1529 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX1170 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX1170-ERR --implicit-check-not=error: %s + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] +// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] +// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], s[0:1], v[2:3], v[4:7] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[2:3], v[4:7] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[4:7] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], 1.0, v[2:3], v[4:7] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0, v[4:7] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 +// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 +// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1], v[2:3], v[4:7] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[2:3], v[4:7] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[4:7] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], 1.0, v[2:3], v[4:7] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0, v[4:7] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 +// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 +// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] +// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] +// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] +// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] +// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], s[0:1], v[2:3], v[4:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[2:3], v[4:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[4:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], 1.0, v[2:3], v[4:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0, v[4:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 +// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 +// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1], v[2:3], v[4:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[2:3], v[4:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[4:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0, v[2:3], v[4:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0, v[4:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 +// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], s0, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, s1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], 1, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 +// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 +// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], s0, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, s1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], 1, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 +// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 +// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 +// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 +// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 +// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 +// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 +// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 +// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 +// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 +// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], s0, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, s1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], 1.0, v1, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1.0, v[2:5] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 +// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 +// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] + + + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 +// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 +// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] +// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] +// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] +// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] +// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1], v[2:5], v10 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3], v10 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s10 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], 1.0, v[2:5], v10 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0, v10 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] +// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1], v[2:5], v10 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3], v10 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s10 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0, v[2:5], v10 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0, v10 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 +// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 +// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 +// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] +// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] +// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] +// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] +// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1], v[2:5], v8 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3], v8 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s8 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], 1.0, v[2:5], v8 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0, v8 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] +// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1], v[2:5], v8 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3], v8 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s8 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0, v[2:5], v8 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0, v8 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], s0, v[1:2], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], 1, v[1:2], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1, v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], s0, v1, v6 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s1, v6 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s6 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], 1, v1, v6 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1, v6 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], s0, v[1:2], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], 1, v[1:2], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1, v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0, v[1:2], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0, v[1:2], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0, v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0, v[1:2], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0, v[1:2], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0, v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0, v[1:2], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0, v[1:2], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0, v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 clamp +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0, v[1:2], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0, v[1:2], v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0, v7 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1 +// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s index 363db1a16b17..a96e9c4c0787 100644 --- a/llvm/test/MC/AMDGPU/literals.s +++ b/llvm/test/MC/AMDGPU/literals.s @@ -206,14 +206,14 @@ v_fract_f64_e32 v[0:1], lit(1.0) v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 // GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0xca,0x1b] // NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU // NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1.0) // NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction // NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU // NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU @@ -658,14 +658,14 @@ v_fract_f64_e32 v[0:1], 0xffffffffffffffff v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 // GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a] // NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU // NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1) // NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction // NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU // NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt new file mode 100644 index 000000000000..1e778fb04aea --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt @@ -0,0 +1,1628 @@ +# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1170 -show-encoding %s | FileCheck --check-prefix=GFX1170 %s +# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1170 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX1170-ERR %s + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x40,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x58,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x40,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x41,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x41,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x42,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x42,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x43,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x43,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] +# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x48,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x41,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x44,0xcc,0x81,0x04,0x12,0x1c] # 1 src0 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x01,0x04,0x12,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x03,0x11,0x1c] # 1 src1 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x03,0x10,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] + + + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr src0 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] # sgpr src1 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1/*Invalid register, operand has 'VGPR_32' register class*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x18] # sgpr src2 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], 1/*Invalid immediate*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1/*Invalid immediate*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x46,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x46,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x48,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x48,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x47,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x47,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x49,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x49,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] +# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x41,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x4a,0xcc,0x81,0x04,0x12,0x1c] # 1 src0 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x01,0x04,0x12,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x03,0x11,0x1c] # 1 src1 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x03,0x10,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] + + + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] +# GFX1170:v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0xc0,0x50,0xcc,0x00,0x09,0x52,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0] +# GFX1170:v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:1 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x44,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] # sgpr src2 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] + +[0x0c,0x40,0x50,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x01,0x50,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0xc0,0x51,0xcc,0x00,0x09,0x52,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:1 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x44,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] # sgpr src2 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] + +[0x0c,0x40,0x51,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x01,0x50,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0xc0,0x52,0xcc,0x00,0x09,0x42,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:1 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x44,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] # sgpr src2 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] + +[0x0c,0x40,0x52,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x01,0x40,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0xc0,0x53,0xcc,0x00,0x09,0x42,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:1 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x44,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] # sgpr src2 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] + +[0x0c,0x40,0x53,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x01,0x40,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x18] + + + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX1170:v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX1170:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX1170:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x54,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0 +# GFX1170:v_swmmac_i32_16x16x32_iu8 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x01,0x04,0x3a,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x03,0x39,0x1c] # 1 src1 +# GFX1170:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x03,0x38,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x18] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x02,0x18] + + + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] # clamp +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x60,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:1 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x58,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] # sgpr src0 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] # sgpr src1 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x01,0x2c,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] # sgpr src2 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] + +[0x03,0x40,0x55,0xcc,0x81,0x02,0x2e,0x1c] # 1 src0 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], 1/*Invalid immediate*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x01,0x02,0x2e,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2d,0x1c] # 1 src1 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1/*Invalid immediate*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + + + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x56,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x01,0x04,0x3a,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x03,0x39,0x1c] # 1 src1 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x03,0x38,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x18] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x02,0x18] + + + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x57,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x57,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x58,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x58,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x59,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x59,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x5a,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x18] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt new file mode 100644 index 000000000000..169fd20488e3 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt @@ -0,0 +1,1628 @@ +# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX1170 %s +# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX1170-ERR %s + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x40,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x40,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x41,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x41,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x42,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x42,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x43,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x43,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] +# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x44,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x46,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x46,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x47,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x47,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x48,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x48,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x49,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x49,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] +# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x4a,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0xc0,0x50,0xcc,0x00,0x05,0x2a,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x60,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x44,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x50,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0xc0,0x51,0xcc,0x00,0x05,0x2a,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x60,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x44,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x51,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0xc0,0x52,0xcc,0x00,0x05,0x22,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x60,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:1 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:2 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:3 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x44,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x52,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0xc0,0x53,0xcc,0x00,0x05,0x22,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x60,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:1 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:2 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:3 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0] +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x44,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x53,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x18] + + + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x54,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0 +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x01,0x02,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1 +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1c,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] + + + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] # clamp +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:1 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] # sgpr_0 src0 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] # sgpr_0 src1 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x55,0xcc,0x81,0x02,0x1a,0x1c] # 1 src0 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x01,0x02,0x1a,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x19,0x1c] # 1 src1 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x18,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x18] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + + + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x60,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x58,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x56,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x01,0x02,0x1e,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1c,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] + + + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x57,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x57,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x58,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x58,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x59,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x59,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX1170-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x5a,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x18]