[AMDGPU] Add WMMA and SWMMAC instructions for gfx1170 (#180731)
Introduce two new subtarget features:
- WMMA256bInsts for GFX11 WMMA instructions, and
- WMMA128bInsts for GFX1170 and GFX12 WMMA and SWMMAC instructions.

Some WMMA instructions have changed from GFX 11.0 to GFX 11.7, so new Real versions were added with a "_gfx1170" suffix. For consistency, all WMMA and SWMMAC GFX 11.7 instructions use this suffix.

To resolve decoding issues caused by the different formats that some WMMA instructions have on GFX 11 and GFX 11.7, new decoding tables were added.
This commit is contained in:
parent
df1eec77b5
commit
829afc4c91
@ -358,23 +358,23 @@ def __builtin_amdgcn_s_wait_event : AMDGPUBuiltin<"void(_Constant short)", [], "
|
||||
// Postfix w32 indicates the builtin requires wavefront size of 32.
|
||||
// Postfix w64 indicates the builtin requires wavefront size of 64.
|
||||
//===----------------------------------------------------------------------===//
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>)", [Const], "gfx11-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, float>)", [Const], "gfx11-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>)", [Const], "wmma-256b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, float>)", [Const], "wmma-256b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
|
||||
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<4, float>)", [Const], "gfx11-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<4, float>)", [Const], "gfx11-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<4, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<4, float>)", [Const], "wmma-256b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<4, float>)", [Const], "wmma-256b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
|
||||
|
||||
def __builtin_amdgcn_s_sendmsg_rtn : AMDGPUBuiltin<"unsigned int(_Constant unsigned int)", [], "gfx11-insts">;
|
||||
def __builtin_amdgcn_s_sendmsg_rtnl : AMDGPUBuiltin<"uint64_t(_Constant unsigned int)", [], "gfx11-insts">;
|
||||
@ -599,67 +599,71 @@ def __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn : AMDGPUBuiltin<"_ExtVector<2,
|
||||
// The second return value of the intrinsic is zext'ed.
|
||||
def __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn : AMDGPUBuiltin<"_ExtVector<2, uint64_t>(unsigned int, unsigned int, _ExtVector<8, unsigned int>, _Constant int)", [], "gfx12-insts">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// GFX1170, GFX12+ only builtins.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// WMMA builtins.
|
||||
// Postfix w32 indicates the builtin requires wavefront size of 32.
|
||||
// Postfix w64 indicates the builtin requires wavefront size of 64.
|
||||
//
|
||||
// Some of these are very similar to their GFX11 counterparts, but they don't
|
||||
// require replication of the A,B matrices, so they use fewer vector elements.
|
||||
// Therefore, we add an "_gfx12" suffix to distinguish them from the existing
|
||||
// builtins.
|
||||
// Some of these are very similar to their base GFX11 counterparts, but they
|
||||
// don't require replication of the A,B matrices, so they use fewer vector
|
||||
// elements. Therefore, we add an "_gfx12" suffix to distinguish them from the
|
||||
// existing builtins.
|
||||
//===----------------------------------------------------------------------===//
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, _Float16>)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, short>)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, int, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
// These are gfx12-only, but for consistency with the other WMMA variants we're
|
||||
// keeping the "_gfx12" suffix.
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, _Float16>)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, short>)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, int, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
// These are gfx1170 and gfx12 only, but for consistency with the other WMMA
|
||||
// variants we're keeping the "_gfx12" suffix.
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, _Float16>)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, short>)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
// These are gfx12-only, but for consistency with the other WMMA variants we're
|
||||
// keeping the "_gfx12" suffix.
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, _Float16>)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, short>)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
// These are gfx1170 and gfx12 only, but for consistency with the other WMMA
|
||||
// variants we're keeping the "_gfx12" suffix.
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, __fp16>, int)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, short>, int)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, short>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
|
||||
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, __fp16>, int)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, short>, int)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, short>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
|
||||
|
||||
def __builtin_amdgcn_prng_b32 : AMDGPUBuiltin<"unsigned int(unsigned int)", [Const], "prng-inst">;
|
||||
def __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, _Float16>, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
|
||||
// REQUIRES: amdgpu-registered-target
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1170 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s
|
||||
|
||||
typedef int v2i __attribute__((ext_vector_type(2)));
|
||||
typedef float v8f __attribute__((ext_vector_type(8)));
|
||||
@ -14,12 +15,12 @@ typedef int v8i __attribute__((ext_vector_type(8)));
|
||||
// amdgcn_wmma_f32_16x16x16_f16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x float> [[C]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8:![0-9]+]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x float> [[C]])
|
||||
// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8:![0-9]+]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
|
||||
{
|
||||
@ -30,12 +31,12 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
|
||||
// amdgcn_wmma_f32_16x16x16_bf16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x float> [[C]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x float> [[C]])
|
||||
// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c)
|
||||
{
|
||||
@ -46,12 +47,12 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c
|
||||
// amdgcn_wmma_f16_16x16x16_f16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], i1 false)
|
||||
// CHECK-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c)
|
||||
{
|
||||
@ -62,12 +63,12 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c)
|
||||
// amdgcn_wmma_bf16_16x16x16_bf16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], i1 false)
|
||||
// CHECK-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c)
|
||||
{
|
||||
@ -78,12 +79,12 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s
|
||||
// amdgcn_wmma_i32_16x16x16_iu8
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false)
|
||||
// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c)
|
||||
{
|
||||
@ -94,79 +95,79 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c)
|
||||
// amdgcn_wmma_i32_16x16x16_iu4
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <8 x i32> [[C]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <8 x i32> [[C]], i1 false)
|
||||
// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Codegen test kernel: stores through `out` the value returned by
// __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false).
// The three boolean literals reach the intrinsic call as constant i1 operands
// (see the CHECK lines for this function); presumably they select A/B
// signedness and clamping -- confirm against the builtin definition.
void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
|
||||
// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Codegen test kernel: stores through `out` the value returned by
// __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c).
// `a` and `b` are passed as v2i and `c` as v8f, unchanged.
void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
|
||||
// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Codegen test kernel: stores through `out` the value returned by
// __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c).
// `a` and `b` are passed as v2i and `c` as v8f, unchanged.
void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
|
||||
// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Codegen test kernel: stores through `out` the value returned by
// __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c).
// `a` and `b` are passed as v2i and `c` as v8f, unchanged.
void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
|
||||
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
|
||||
// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Codegen test kernel: stores through `out` the value returned by
// __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c).
// `a` and `b` are passed as v2i and `c` as v8f, unchanged.
void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false)
|
||||
// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Codegen test kernel: stores through `out` the value returned by
// __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false).
// The three boolean literals reach the intrinsic call as constant i1 operands
// (see the CHECK lines for this function); presumably they select A/B
// signedness and clamping -- confirm against the builtin definition.
void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false);
|
||||
}
|
||||
//.
|
||||
// CHECK-GFX1200: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
|
||||
// CHECK-GFX1200: [[META7]] = !{!"Simple C/C++ TBAA"}
|
||||
// CHECK-GFX1200: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0}
|
||||
// CHECK: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
|
||||
// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"}
|
||||
// CHECK: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0}
|
||||
//.
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
|
||||
// REQUIRES: amdgpu-registered-target
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1170 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s
|
||||
|
||||
typedef float v4f __attribute__((ext_vector_type(4)));
|
||||
typedef half v4h __attribute__((ext_vector_type(4)));
|
||||
@ -13,12 +14,12 @@ typedef int v4i __attribute__((ext_vector_type(4)));
|
||||
// amdgcn_wmma_f32_16x16x16_f16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w64(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x float> [[C]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w64(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x float> [[C]])
|
||||
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c)
|
||||
{
|
||||
@ -29,12 +30,12 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c)
|
||||
// amdgcn_wmma_f32_16x16x16_bf16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x float> [[C]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x float> [[C]])
|
||||
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c)
|
||||
{
|
||||
@ -45,12 +46,12 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c
|
||||
// amdgcn_wmma_f16_16x16x16_f16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w64(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w64(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], i1 false)
|
||||
// CHECK-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c)
|
||||
{
|
||||
@ -61,12 +62,12 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c)
|
||||
// amdgcn_wmma_bf16_16x16x16_bf16
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], i1 false)
|
||||
// CHECK-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c)
|
||||
{
|
||||
@ -77,12 +78,12 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s
|
||||
// amdgcn_wmma_i32_16x16x16_iu8
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
|
||||
// CHECK-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c)
|
||||
{
|
||||
@ -93,79 +94,79 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c)
|
||||
// amdgcn_wmma_i32_16x16x16_iu4
|
||||
//
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
|
||||
// CHECK-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Codegen test kernel: stores through `out` the value returned by
// __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false).
// The three boolean literals reach the intrinsic call as constant i1 operands
// (see the CHECK lines for this function); presumably they select A/B
// signedness and clamping -- confirm against the builtin definition.
void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
|
||||
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Codegen test kernel: stores through `out` the value returned by
// __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c).
// NOTE(review): the function name ends in _w32 although it calls the _w64
// builtin; the earlier kernels in this wavefrontsize64 test use a _w64 suffix.
// Renaming would also require regenerating the CHECK-LABEL lines, so the name
// is left as-is here.
void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
|
||||
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Codegen test kernel: stores through `out` the value returned by
// __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c).
// NOTE(review): the function name ends in _w32 although it calls the _w64
// builtin; the earlier kernels in this wavefrontsize64 test use a _w64 suffix.
// Renaming would also require regenerating the CHECK-LABEL lines, so the name
// is left as-is here.
void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
|
||||
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Codegen test kernel: stores through `out` the value returned by
// __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c).
// NOTE(review): the function name ends in _w32 although it calls the _w64
// builtin; the earlier kernels in this wavefrontsize64 test use a _w64 suffix.
// Renaming would also require regenerating the CHECK-LABEL lines, so the name
// is left as-is here.
void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
|
||||
// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
|
||||
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Calls the bf8_bf8 WMMA builtin on a, b and c and stores the returned
// vector through out.
// NOTE(review): the function name ends in _w32 while the builtin invoked is
// the _w64_gfx12 variant; confirm this naming is intentional before renaming.
void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c);
|
||||
}
|
||||
|
||||
// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
|
||||
// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
|
||||
// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-GFX1200-NEXT: ret void
|
||||
// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
|
||||
// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
||||
// CHECK-NEXT: [[ENTRY:.*:]]
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
|
||||
// CHECK-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Calls the 16x16x32 iu4 WMMA builtin with constant flags (true, true, false)
// around the a, b and c operands and stores the returned vector through out.
// NOTE(review): the leading `true` flags presumably select signedness of A and
// B and the trailing `false` presumably is the clamp bit; verify against the
// builtin definition. The function name ends in _w32 while the builtin invoked
// is the _w64_gfx12 variant; confirm this naming is intentional.
void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c)
|
||||
{
|
||||
*out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false);
|
||||
}
|
||||
//.
|
||||
// CHECK-GFX1200: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
|
||||
// CHECK-GFX1200: [[META7]] = !{!"Simple C/C++ TBAA"}
|
||||
// CHECK-GFX1200: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0}
|
||||
// CHECK: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
|
||||
// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"}
|
||||
// CHECK: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0}
|
||||
//.
|
||||
|
||||
@ -21,14 +21,14 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b
|
||||
global v16s* out16s, v2i a2i, v2i b2i, v16s c16s,
|
||||
global v8i* out8i, v4i a4i, v4i b4i, v8i c8i)
|
||||
{
|
||||
*out8f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a16h, b16h, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w32' needs target feature gfx11-insts,wavefrontsize32}}
|
||||
*out8f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a16s, b16s, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32' needs target feature gfx11-insts,wavefrontsize32}}
|
||||
*out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w32' needs target feature gfx11-insts,wavefrontsize32}}
|
||||
*out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32' needs target feature gfx11-insts,wavefrontsize32}}
|
||||
*out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32' needs target feature gfx11-insts,wavefrontsize32}}
|
||||
*out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32' needs target feature gfx11-insts,wavefrontsize32}}
|
||||
*out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a4i, true, b4i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32' needs target feature gfx11-insts,wavefrontsize32}}
|
||||
*out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a2i, true, b2i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32' needs target feature gfx11-insts,wavefrontsize32}}
|
||||
*out8f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a16h, b16h, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
|
||||
*out8f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a16s, b16s, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
|
||||
*out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
|
||||
*out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
|
||||
*out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
|
||||
*out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
|
||||
*out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a4i, true, b4i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
|
||||
*out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a2i, true, b2i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@ -21,14 +21,14 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out4f, v16h a16h, v16h b
|
||||
global v8s* out8s, v4i a4i, v4i b4i, v8s c8s,
|
||||
global v4i* out4i, v2i a2i, v2i b2i, v4i c4i)
|
||||
{
|
||||
*out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature gfx11-insts,wavefrontsize64}}
|
||||
*out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature gfx11-insts,wavefrontsize64}}
|
||||
*out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature gfx11-insts,wavefrontsize64}}
|
||||
*out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature gfx11-insts,wavefrontsize64}}
|
||||
*out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature gfx11-insts,wavefrontsize64}}
|
||||
*out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature gfx11-insts,wavefrontsize64}}
|
||||
*out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature gfx11-insts,wavefrontsize64}}
|
||||
*out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature gfx11-insts,wavefrontsize64}}
|
||||
*out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
|
||||
*out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
|
||||
*out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
|
||||
*out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
|
||||
*out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
|
||||
*out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
|
||||
*out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
|
||||
*out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@ -775,6 +775,14 @@ defm CvtFP8VOP1Bug : AMDGPUSubtargetFeature<"cvt-fp8-vop1-bug",
|
||||
[FeatureFP8ConversionInsts]
|
||||
>;
|
||||
|
||||
// Subtarget feature "wmma-256b-insts": marks targets whose WMMA instructions
// take A and B matrices with duplicated data. In this change it is appended to
// the GFX11 generic, GFX11.0 common and GFX11.5 common feature sets.
defm WMMA256bInsts : AMDGPUSubtargetFeature<"wmma-256b-insts",
|
||||
"Has WMMA instructions where A and B matrices have duplicated data"
|
||||
>;
|
||||
|
||||
// Subtarget feature "wmma-128b-insts": marks targets whose WMMA instructions
// take A and B matrices without duplicated data. In this change it is appended
// to the GFX11.7.0 and GFX12 feature sets, and the isGFX1170 helpers test for
// it together with GFX11.
defm WMMA128bInsts : AMDGPUSubtargetFeature<"wmma-128b-insts",
|
||||
"Has WMMA instructions where A and B matrices do not have duplicated data"
|
||||
>;
|
||||
|
||||
defm PkFmacF16Inst : AMDGPUSubtargetFeature<"pk-fmac-f16-inst",
|
||||
"Has v_pk_fmac_f16 instruction"
|
||||
>;
|
||||
@ -1820,9 +1828,9 @@ def FeatureISAVersion11_Common : FeatureSet<
|
||||
FeatureD16Writes32BitVgpr,
|
||||
]>;
|
||||
|
||||
// There are few workarounds that need to be
|
||||
// added to all targets. This pessimizes codegen
|
||||
// a bit on the generic GFX11 target.
|
||||
// There are a few workarounds that need to be added to all targets. This
|
||||
// pessimizes codegen a bit on the generic GFX11 target. This generic target
|
||||
// does not include GFX1170 due to incompatible changes.
|
||||
def FeatureISAVersion11_Generic: FeatureSet<
|
||||
!listconcat(FeatureISAVersion11_Common.Features,
|
||||
[FeatureMSAALoadDstSelBug,
|
||||
@ -1831,14 +1839,16 @@ def FeatureISAVersion11_Generic: FeatureSet<
|
||||
FeatureMADIntraFwdBug,
|
||||
FeaturePrivEnabledTrap2NopBug,
|
||||
FeatureRequiresCOV6,
|
||||
FeatureRequiredExportPriority])>;
|
||||
FeatureRequiredExportPriority,
|
||||
FeatureWMMA256bInsts])>;
|
||||
|
||||
def FeatureISAVersion11_0_Common : FeatureSet<
|
||||
!listconcat(FeatureISAVersion11_Common.Features,
|
||||
[FeatureMSAALoadDstSelBug,
|
||||
FeatureVALUTransUseHazard,
|
||||
FeatureMADIntraFwdBug,
|
||||
FeaturePrivEnabledTrap2NopBug])>;
|
||||
FeaturePrivEnabledTrap2NopBug,
|
||||
FeatureWMMA256bInsts])>;
|
||||
|
||||
def FeatureISAVersion11_0_0 : FeatureSet<
|
||||
!listconcat(FeatureISAVersion11_0_Common.Features,
|
||||
@ -1861,7 +1871,8 @@ def FeatureISAVersion11_5_Common : FeatureSet<
|
||||
!listconcat(FeatureISAVersion11_Common.Features,
|
||||
[FeatureSALUFloatInsts,
|
||||
FeatureDPPSrc1SGPR,
|
||||
FeatureRequiredExportPriority])>;
|
||||
FeatureRequiredExportPriority,
|
||||
FeatureWMMA256bInsts])>;
|
||||
|
||||
def FeatureISAVersion11_5_0 : FeatureSet<
|
||||
!listconcat(FeatureISAVersion11_5_Common.Features,
|
||||
@ -1885,7 +1896,8 @@ def FeatureISAVersion11_7_0 : FeatureSet<
|
||||
[FeatureSALUFloatInsts,
|
||||
FeatureDPPSrc1SGPR,
|
||||
FeatureFP8ConversionInsts,
|
||||
FeatureDot11Insts])>;
|
||||
FeatureDot11Insts,
|
||||
FeatureWMMA128bInsts])>;
|
||||
|
||||
def FeatureISAVersion12 : FeatureSet<
|
||||
[FeatureGFX12,
|
||||
@ -1915,6 +1927,7 @@ def FeatureISAVersion12 : FeatureSet<
|
||||
FeatureImageInsts,
|
||||
FeatureExtendedImageInsts,
|
||||
FeatureFP8ConversionInsts,
|
||||
FeatureWMMA128bInsts,
|
||||
FeatureIEEEMinimumMaximumInsts,
|
||||
FeaturePackedTID,
|
||||
FeatureVcmpxPermlaneHazard,
|
||||
|
||||
@ -1556,6 +1556,8 @@ public:
|
||||
return AMDGPU::isGFX11Plus(getSTI());
|
||||
}
|
||||
|
||||
// Returns whether the current subtarget info (getSTI()) is classified as
// GFX1170 by AMDGPU::isGFX1170.
bool isGFX1170() const { return AMDGPU::isGFX1170(getSTI()); }
|
||||
|
||||
bool isGFX12() const { return AMDGPU::isGFX12(getSTI()); }
|
||||
|
||||
bool isGFX12Plus() const { return AMDGPU::isGFX12Plus(getSTI()); }
|
||||
|
||||
@ -686,11 +686,19 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
|
||||
Address, CS))
|
||||
break;
|
||||
|
||||
if (isGFX1170() &&
|
||||
tryDecodeInst(DecoderTableGFX117064, MI, QW, Address, CS))
|
||||
break;
|
||||
|
||||
if (isGFX11() &&
|
||||
tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
|
||||
Address, CS))
|
||||
break;
|
||||
|
||||
if (isGFX1170() &&
|
||||
tryDecodeInst(DecoderTableGFX1170W6464, MI, QW, Address, CS))
|
||||
break;
|
||||
|
||||
if (isGFX11() &&
|
||||
tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS))
|
||||
break;
|
||||
@ -2247,6 +2255,8 @@ bool AMDGPUDisassembler::isGFX11Plus() const {
|
||||
return AMDGPU::isGFX11Plus(STI);
|
||||
}
|
||||
|
||||
// Returns whether the disassembler's subtarget (STI) is classified as GFX1170
// by AMDGPU::isGFX1170.
bool AMDGPUDisassembler::isGFX1170() const { return AMDGPU::isGFX1170(STI); }
|
||||
|
||||
bool AMDGPUDisassembler::isGFX12() const {
|
||||
return STI.hasFeature(AMDGPU::FeatureGFX12);
|
||||
}
|
||||
|
||||
@ -178,6 +178,7 @@ public:
|
||||
bool isGFX10() const;
|
||||
bool isGFX10Plus() const;
|
||||
bool isGFX11() const;
|
||||
bool isGFX1170() const;
|
||||
bool isGFX11Plus() const;
|
||||
bool isGFX12() const;
|
||||
bool isGFX12Plus() const;
|
||||
|
||||
@ -396,6 +396,10 @@ public:
|
||||
return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
|
||||
}
|
||||
|
||||
// Returns true when the subtarget generation is GFX11 and the WMMA 128b
// feature is present; that combination is what identifies GFX1170 here.
bool isGFX1170() const {
|
||||
return getGeneration() == GFX11 && hasWMMA128bInsts();
|
||||
}
|
||||
|
||||
bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
|
||||
|
||||
bool hasAtomicFaddInsts() const {
|
||||
|
||||
@ -44,9 +44,10 @@ class GFXGen<Predicate pred, string dn, string suffix, int sub> {
|
||||
def GFX13Gen : GFXGen<isGFX13Only, "GFX13", "_gfx13", SIEncodingFamily.GFX13>;
|
||||
def GFX1250Gen : GFXGen<isGFX125xOnly, "GFX1250", "_gfx1250", SIEncodingFamily.GFX1250>;
|
||||
def GFX12Not12_50Gen : GFXGen<isGFX12Not12_50, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
|
||||
def GFX12Gen : GFXGen<isGFX12Only, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
|
||||
def GFX11Gen : GFXGen<isGFX11Only, "GFX11", "_gfx11", SIEncodingFamily.GFX11>;
|
||||
def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.GFX10>;
|
||||
def GFX12Gen : GFXGen<isGFX12Only, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
|
||||
def GFX1170Gen : GFXGen<isGFX11Only, "GFX1170", "_gfx1170", SIEncodingFamily.GFX11>;
|
||||
def GFX11Gen : GFXGen<isGFX11Only, "GFX11", "_gfx11", SIEncodingFamily.GFX11>;
|
||||
def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.GFX10>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SI DAG Nodes
|
||||
|
||||
@ -2598,6 +2598,10 @@ bool isGFX11(const MCSubtargetInfo &STI) {
|
||||
return STI.hasFeature(AMDGPU::FeatureGFX11);
|
||||
}
|
||||
|
||||
// Returns true for GFX1170: a subtarget for which isGFX11(STI) holds and that
// also has FeatureWMMA128bInsts set.
bool isGFX1170(const MCSubtargetInfo &STI) {
|
||||
return isGFX11(STI) && STI.hasFeature(AMDGPU::FeatureWMMA128bInsts);
|
||||
}
|
||||
|
||||
bool isGFX11Plus(const MCSubtargetInfo &STI) {
|
||||
return isGFX11(STI) || isGFX12Plus(STI);
|
||||
}
|
||||
|
||||
@ -1705,6 +1705,7 @@ bool isGFX10Plus(const MCSubtargetInfo &STI);
|
||||
bool isNotGFX10Plus(const MCSubtargetInfo &STI);
|
||||
bool isGFX10Before1030(const MCSubtargetInfo &STI);
|
||||
bool isGFX11(const MCSubtargetInfo &STI);
|
||||
bool isGFX1170(const MCSubtargetInfo &STI);
|
||||
bool isGFX11Plus(const MCSubtargetInfo &STI);
|
||||
bool isGFX12(const MCSubtargetInfo &STI);
|
||||
bool isGFX12Plus(const MCSubtargetInfo &STI);
|
||||
|
||||
@ -1426,22 +1426,18 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
|
||||
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
|
||||
|
||||
defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
|
||||
let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
|
||||
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
|
||||
def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
|
||||
}
|
||||
}
|
||||
if convertibleTo3Addr then {
|
||||
|
||||
let SubtargetPredicate = HasWMMA256bInsts in {
|
||||
let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
|
||||
let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
|
||||
def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
|
||||
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
|
||||
def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
|
||||
}
|
||||
if convertibleTo3Addr then {
|
||||
let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
|
||||
def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
|
||||
}
|
||||
}
|
||||
}
|
||||
def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
|
||||
!cast<Instruction>(NAME # _threeaddr # Suffix)>;
|
||||
}
|
||||
|
||||
let SubtargetPredicate = isGFX11Only in {
|
||||
if !eq(Type, WMMAOpSel) then {
|
||||
def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
|
||||
} else if !eq(Type, WMMAUIClamp) then {
|
||||
@ -1450,6 +1446,11 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
|
||||
def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
|
||||
}
|
||||
}
|
||||
|
||||
if convertibleTo3Addr then {
|
||||
def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
|
||||
!cast<Instruction>(NAME # _threeaddr # Suffix)>;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1727,7 +1728,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse
|
||||
defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2");
|
||||
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
|
||||
|
||||
let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in {
|
||||
let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1, SubtargetPredicate = HasWMMA128bInsts in {
|
||||
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in
|
||||
def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
|
||||
let PseudoInstr = Instr#PseudoInstrSuffix;
|
||||
@ -2047,7 +2048,7 @@ class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile
|
||||
let WaveSizePredicate = isWave64;
|
||||
}
|
||||
|
||||
let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12PlusNot12_50 in {
|
||||
let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX11PlusNot12_50, OtherPredicates = [HasWMMA128bInsts] in {
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>;
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>;
|
||||
defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>;
|
||||
@ -2074,7 +2075,7 @@ let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12PlusNot12_50 in {
|
||||
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>;
|
||||
}
|
||||
|
||||
let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in {
|
||||
let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX11PlusNot12_50, OtherPredicates = [HasWMMA128bInsts] in {
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>;
|
||||
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>;
|
||||
defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>;
|
||||
@ -2229,6 +2230,18 @@ multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<8> op, VOP3PWMMA_Profile WMMAP,
|
||||
VOP3PeWmma<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl, WMMAP>;
|
||||
}
|
||||
|
||||
// Wave32 real encoding for GFX1170: instantiates VOP3P_WMMA_Real_Base as
// `_twoaddr` with GFX1170Gen, opcode `op` and profile `WMMAP`, placed in the
// "GFX1170" decoder namespace.
// NOTE(review): presumably this namespace backs the DecoderTableGFX117064
// table the disassembler tries for GFX1170 -- confirm.
multiclass VOP3P_Real_WMMA_gfx1170 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
|
||||
let WaveSizePredicate = isWave32, DecoderNamespace = "GFX1170" in {
|
||||
defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1170Gen, op, WMMAP>;
|
||||
}
|
||||
}
|
||||
|
||||
// Wave64 real encoding for GFX1170: instantiates VOP3P_WMMA_Real_Base as
// `_twoaddr` with GFX1170Gen, opcode `op` and profile `WMMAP`, placed in the
// "GFX1170W64" decoder namespace.
// NOTE(review): presumably this namespace backs the DecoderTableGFX1170W6464
// table the disassembler tries for GFX1170 -- confirm.
multiclass VOP3P_Real_WMMA_gfx1170w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
|
||||
let WaveSizePredicate = isWave64, DecoderNamespace = "GFX1170W64" in {
|
||||
defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1170Gen, op, WMMAP>;
|
||||
}
|
||||
}
|
||||
|
||||
multiclass VOP3P_Real_WMMA_gfx12 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
|
||||
let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
|
||||
defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
|
||||
@ -2241,6 +2254,14 @@ multiclass VOP3P_Real_WMMA_gfx12w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
|
||||
}
|
||||
}
|
||||
|
||||
// Convenience multiclass: inherits both the GFX1170 and the GFX12 wave32 real
// definitions for the same opcode `op` and profile `WMMAP`.
multiclass VOP3P_Real_WMMA_gfx1170_gfx12 <bits<8> op, VOP3PWMMA_Profile WMMAP> :
|
||||
VOP3P_Real_WMMA_gfx1170<op, WMMAP>,
|
||||
VOP3P_Real_WMMA_gfx12<op, WMMAP>;
|
||||
|
||||
// Convenience multiclass: inherits both the GFX1170 and the GFX12 wave64 real
// definitions for the same opcode `op` and profile `WMMAP`.
multiclass VOP3P_Real_WMMA_gfx1170_gfx12w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> :
|
||||
VOP3P_Real_WMMA_gfx1170w64<op, WMMAP>,
|
||||
VOP3P_Real_WMMA_gfx12w64<op, WMMAP>;
|
||||
|
||||
multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
|
||||
let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
|
||||
defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1250Gen, op, WMMAP>;
|
||||
@ -2345,54 +2366,53 @@ multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<string Gen, bits<8> op, bits<8> Ld
|
||||
}
|
||||
}
|
||||
|
||||
defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
|
||||
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
|
||||
defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x043, BF16_BF16_WMMA_w32>;
|
||||
defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>;
|
||||
defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>;
|
||||
defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x040, F32_F16_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x041, F32_BF16_WMMA_w32>;
|
||||
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x042, F16_F16_WMMA_w32>;
|
||||
defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x043, BF16_BF16_WMMA_w32>;
|
||||
defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x044, I32_IU8_WMMA_w32>;
|
||||
defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x045, I32_IU4X16_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x046, F32_FP8BF8_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x047, F32_FP8BF8_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x048, F32_FP8BF8_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x049, F32_FP8BF8_WMMA_w32>;
|
||||
defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x04a, I32_IU4X32_WMMA_w32>;
|
||||
|
||||
defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>;
|
||||
defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>;
|
||||
defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>;
|
||||
defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>;
|
||||
defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>;
|
||||
defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x040, F32_F16_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x041, F32_BF16_WMMA_w64>;
|
||||
defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x042, F16_F16_WMMA_w64>;
|
||||
defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x043, BF16_BF16_WMMA_w64>;
|
||||
defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x044, I32_IU8_WMMA_w64>;
|
||||
defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>;
|
||||
defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>;
|
||||
defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>;
|
||||
|
||||
defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x050, F32_F16_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x051, F32_BF16_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x052, F16_F16_SWMMAC_w32>;
|
||||
defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x053, BF16_BF16_SWMMAC_w32>;
|
||||
defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x054, I32_IU8_SWMMAC_w32>;
|
||||
defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>;
|
||||
defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>;
|
||||
|
||||
defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>;
|
||||
defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>;
|
||||
defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>;
|
||||
defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>;
|
||||
defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>;
|
||||
|
||||
defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>;
|
||||
defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>;
|
||||
defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>;
|
||||
defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x050, F32_F16_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x052, F16_F16_SWMMAC_w64>;
|
||||
defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>;
|
||||
defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>;
|
||||
defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
|
||||
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;
|
||||
|
||||
defm V_WMMA_F32_16X16X4_F32_w32 : VOP3P_Real_WMMA_gfx1250 <0x05d, F32_F32_WMMA_w32>;
|
||||
defm V_WMMA_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x062, F32_BF16X32_WMMA_w32>;
|
||||
|
||||
@ -515,13 +515,38 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
|
||||
Features["qsad-insts"] = true;
|
||||
Features["cvt-pknorm-vop2-insts"] = true;
|
||||
Features["fp8-conversion-insts"] = true;
|
||||
Features["wmma-128b-insts"] = true;
|
||||
Features["atomic-fmin-fmax-global-f32"] = true;
|
||||
break;
|
||||
case GK_GFX1170:
|
||||
// TODO-GFX1170: Update features map for gfx1170
|
||||
Features["ci-insts"] = true;
|
||||
Features["dot5-insts"] = true;
|
||||
Features["dot7-insts"] = true;
|
||||
Features["dot8-insts"] = true;
|
||||
Features["dot9-insts"] = true;
|
||||
Features["dot10-insts"] = true;
|
||||
Features["dot12-insts"] = true;
|
||||
Features["dl-insts"] = true;
|
||||
Features["16-bit-insts"] = true;
|
||||
Features["dpp"] = true;
|
||||
Features["gfx8-insts"] = true;
|
||||
Features["gfx9-insts"] = true;
|
||||
Features["gfx10-insts"] = true;
|
||||
Features["gfx10-3-insts"] = true;
|
||||
Features["gfx11-insts"] = true;
|
||||
Features["atomic-fadd-rtn-insts"] = true;
|
||||
Features["image-insts"] = true;
|
||||
Features["cube-insts"] = true;
|
||||
Features["lerp-inst"] = true;
|
||||
Features["sad-insts"] = true;
|
||||
Features["qsad-insts"] = true;
|
||||
Features["cvt-pknorm-vop2-insts"] = true;
|
||||
Features["gws"] = true;
|
||||
Features["dot11-insts"] = true;
|
||||
Features["fp8-conversion-insts"] = true;
|
||||
[[fallthrough]];
|
||||
Features["wmma-128b-insts"] = true;
|
||||
Features["atomic-fmin-fmax-global-f32"] = true;
|
||||
break;
|
||||
case GK_GFX1153:
|
||||
case GK_GFX1152:
|
||||
case GK_GFX1151:
|
||||
@ -554,6 +579,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
|
||||
Features["qsad-insts"] = true;
|
||||
Features["cvt-pknorm-vop2-insts"] = true;
|
||||
Features["gws"] = true;
|
||||
Features["wmma-256b-insts"] = true;
|
||||
Features["atomic-fmin-fmax-global-f32"] = true;
|
||||
break;
|
||||
case GK_GFX1036:
|
||||
|
||||
@ -1,14 +1,15 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
|
||||
@ -17,13 +18,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <8 x half> %B
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
|
||||
@ -32,13 +33,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
|
||||
@ -47,13 +48,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
|
||||
@ -62,13 +63,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
|
||||
@ -77,13 +78,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
|
||||
@ -92,11 +93,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
|
||||
@ -105,11 +106,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <8 x half> %B
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
|
||||
@ -118,11 +119,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x half> %C
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
|
||||
@ -131,11 +132,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
|
||||
@ -144,13 +145,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
|
||||
@ -159,13 +160,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
|
||||
@ -174,13 +175,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
|
||||
@ -189,13 +190,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
|
||||
@ -204,13 +205,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
|
||||
@ -219,13 +220,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
|
||||
@ -234,13 +235,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
|
||||
@ -249,13 +250,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
|
||||
@ -264,13 +265,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
|
||||
@ -279,13 +280,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <16 x half> %B
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
|
||||
@ -294,11 +295,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
|
||||
@ -307,11 +308,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <16 x half> %B
|
||||
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
|
||||
@ -322,13 +323,13 @@ bb:
|
||||
; both neg and abs patterns (wmma matrix C f32 or f16 )
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%fneg.fabs.C = fneg <8 x float> %fabs.C
|
||||
@ -338,11 +339,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
|
||||
%fneg.fabs.C = fneg <8 x half> %fabs.C
|
||||
@ -352,15 +353,15 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%el3 = extractelement <8 x float> %C, i32 3
|
||||
%el3.fabs = call float @llvm.fabs.f32(float %el3)
|
||||
@ -374,13 +375,13 @@ bb:
|
||||
; A or B matrix modifier and constant in C
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
@ -389,11 +390,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <8 x half> %B
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
|
||||
@ -404,6 +405,27 @@ bb:
|
||||
; pack f16 elements with v_perm_b32 since they don't come from same b32
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: flat_load_b128 v[12:15], v[8:9]
|
||||
; GFX1170-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
|
||||
; GFX1170-NEXT: v_and_b32_e32 v8, 0xffff, v12
|
||||
; GFX1170-NEXT: v_and_b32_e32 v9, 0xffff, v14
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1170-NEXT: v_and_b32_e32 v14, 0xffff, v16
|
||||
; GFX1170-NEXT: v_and_b32_e32 v16, 0xffff, v18
|
||||
; GFX1170-NEXT: v_lshl_or_b32 v12, v13, 16, v8
|
||||
; GFX1170-NEXT: v_lshl_or_b32 v13, v15, 16, v9
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX1170-NEXT: v_lshl_or_b32 v14, v17, 16, v14
|
||||
; GFX1170-NEXT: v_lshl_or_b32 v15, v19, 16, v16
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
|
||||
; GFX1170-NEXT: global_store_b128 v[10:11], v[12:15], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
|
||||
@ -1,14 +1,15 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -16,27 +17,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s7, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s0
|
||||
; GCN-NEXT: s_mov_b32 s6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
|
||||
; GCN-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
|
||||
; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -44,13 +45,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -58,27 +59,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s7, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s0
|
||||
; GCN-NEXT: s_mov_b32 s6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
|
||||
; GCN-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
|
||||
; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -86,11 +87,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
@ -98,19 +99,19 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x42004200
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x42004200
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
@ -118,19 +119,19 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x3f803f80
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
@ -138,19 +139,19 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x3fc03fc0
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
|
||||
; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
@ -158,13 +159,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -172,27 +173,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_movk_i32 s0, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_movk_i32 s0, 0x80
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s7, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s0
|
||||
; GCN-NEXT: s_mov_b32 s6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -200,13 +201,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -214,27 +215,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_movk_i32 s0, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_movk_i32 s0, 0x80
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s7, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s0
|
||||
; GCN-NEXT: s_mov_b32 s6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
|
||||
; GCN-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
|
||||
; GCN-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
|
||||
; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -242,13 +243,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -256,27 +257,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s7, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s0
|
||||
; GCN-NEXT: s_mov_b32 s6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -284,13 +285,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -298,27 +299,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s7, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s0
|
||||
; GCN-NEXT: s_mov_b32 s6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -326,13 +327,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -340,27 +341,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s7, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s0
|
||||
; GCN-NEXT: s_mov_b32 s6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -368,13 +369,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -382,27 +383,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s7, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s0
|
||||
; GCN-NEXT: s_mov_b32 s6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -410,13 +411,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -424,27 +425,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_movk_i32 s0, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s7, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s0
|
||||
; GFX12-NEXT: s_mov_b32 s5, s0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_movk_i32 s0, 0x80
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s7, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s0
|
||||
; GCN-NEXT: s_mov_b32 s6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
|
||||
; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
|
||||
; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
|
||||
; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -473,3 +474,6 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1170: {{.*}}
|
||||
; GFX12: {{.*}}
|
||||
|
||||
@ -1,14 +1,15 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -16,13 +17,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -30,13 +31,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -46,13 +47,13 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -60,13 +61,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -74,13 +75,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -90,13 +91,13 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -104,13 +105,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -118,13 +119,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -136,13 +137,13 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -150,13 +151,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -164,13 +165,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -180,13 +181,13 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -194,13 +195,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -208,13 +209,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -224,13 +225,13 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -238,13 +239,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -252,13 +253,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -271,3 +272,6 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1170: {{.*}}
|
||||
; GFX12: {{.*}}
|
||||
|
||||
@ -1,7 +1,27 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v20, v[20:21], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[22:23], v[26:29], off
|
||||
; GFX1170-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[24:25], v[12:15], off
|
||||
; GFX1170-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
|
||||
@ -32,6 +52,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v20, v[20:21], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[22:23], v[26:29], off
|
||||
; GFX1170-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[24:25], v[12:15], off
|
||||
; GFX1170-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
|
||||
@ -62,6 +101,19 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v16, v[16:17], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
|
||||
; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[22:25], off
|
||||
; GFX1170-NEXT: global_store_b128 v[20:21], v[12:15], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
|
||||
@ -86,6 +138,19 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v16, v[16:17], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
|
||||
; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[22:25], off
|
||||
; GFX1170-NEXT: global_store_b128 v[20:21], v[12:15], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
|
||||
@ -110,6 +175,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
@ -140,6 +224,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v11, v[11:12], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[17:20], off
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v11, v[11:12], off
|
||||
@ -170,6 +273,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
@ -200,6 +322,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
@ -230,6 +371,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
@ -260,6 +420,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
@ -299,3 +478,5 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GCN: {{.*}}
|
||||
|
||||
@ -1,14 +1,15 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -16,13 +17,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -30,11 +31,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
@ -42,11 +43,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_bf16_16x16x16_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
@ -54,13 +55,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -68,13 +69,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -82,13 +83,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -96,13 +97,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -110,13 +111,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -124,13 +125,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -138,13 +139,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -153,13 +154,13 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -167,13 +168,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -181,11 +182,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f16_16x16x32_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
|
||||
; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
@ -193,11 +194,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
|
||||
; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
@ -205,13 +206,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -219,13 +220,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -233,13 +234,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -247,13 +248,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -261,13 +262,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -275,13 +276,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -289,13 +290,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -324,3 +325,6 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1170: {{.*}}
|
||||
; GFX12: {{.*}}
|
||||
|
||||
@ -1,12 +1,13 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <4 x half> %A
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
|
||||
@ -15,11 +16,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <4 x half> %B
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
|
||||
@ -28,11 +29,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x float> %C
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
|
||||
@ -41,11 +42,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
|
||||
@ -54,11 +55,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x float> %C
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
|
||||
@ -67,11 +68,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
|
||||
@ -80,11 +81,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <4 x half> %A
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
|
||||
@ -93,11 +94,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <4 x half> %B
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
|
||||
@ -106,11 +107,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x half> %C
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
|
||||
@ -119,11 +120,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
|
||||
@ -132,11 +133,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x float> %C
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
|
||||
@ -145,11 +146,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
|
||||
@ -158,11 +159,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x float> %C
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
|
||||
@ -171,11 +172,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
|
||||
@ -184,11 +185,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x float> %C
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
|
||||
@ -197,11 +198,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
|
||||
@ -210,11 +211,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x float> %C
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
|
||||
@ -223,11 +224,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
|
||||
@ -236,11 +237,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <4 x half> %A
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
|
||||
@ -249,11 +250,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <8 x half> %B
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
|
||||
@ -262,11 +263,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <4 x half> %A
|
||||
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
|
||||
@ -275,11 +276,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <8 x half> %B
|
||||
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
|
||||
@ -290,11 +291,11 @@ bb:
|
||||
; both neg and abs patterns (wmma matrix C f32 or f16)
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%fneg.fabs.C = fneg <4 x float> %fabs.C
|
||||
@ -304,11 +305,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
|
||||
%fneg.fabs.C = fneg <4 x half> %fabs.C
|
||||
@ -318,13 +319,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%el3 = extractelement <4 x float> %C, i32 3
|
||||
%el3.fabs = call float @llvm.fabs.f32(float %el3)
|
||||
@ -338,11 +339,11 @@ bb:
|
||||
; A or B matrix modifier and constant in C
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <4 x half> %A
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
@ -351,11 +352,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <4 x half> %B
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
|
||||
@ -366,6 +367,20 @@ bb:
|
||||
; pack f16 elements with v_perm_b32 since they don't come from same b32
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: flat_load_b128 v[8:11], v[4:5]
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1170-NEXT: v_and_b32_e32 v4, 0xffff, v8
|
||||
; GFX1170-NEXT: v_and_b32_e32 v5, 0xffff, v10
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX1170-NEXT: v_lshl_or_b32 v4, v9, 16, v4
|
||||
; GFX1170-NEXT: v_lshl_or_b32 v5, v11, 16, v5
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
|
||||
; GFX1170-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5]
|
||||
|
||||
@ -1,12 +1,13 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -14,21 +15,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v9, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v8, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v9, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -36,11 +37,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -48,21 +49,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v9, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v8, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v9, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -70,11 +71,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
|
||||
store <4 x half> %res, ptr addrspace(1) %out
|
||||
@ -82,17 +83,17 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x42004200
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
|
||||
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x42004200
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
|
||||
; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
|
||||
store <4 x half> %res, ptr addrspace(1) %out
|
||||
@ -100,17 +101,17 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
|
||||
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x3f803f80
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
|
||||
; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
|
||||
store <4 x i16> %res, ptr addrspace(1) %out
|
||||
@ -118,17 +119,17 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
|
||||
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x3fc03fc0
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
|
||||
; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
|
||||
store <4 x i16> %res, ptr addrspace(1) %out
|
||||
@ -136,11 +137,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -148,21 +149,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_movk_i32 s0, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_movk_i32 s0, 0x80
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -170,11 +171,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -182,21 +183,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_movk_i32 s0, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_movk_i32 s0, 0x80
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -204,11 +205,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -216,21 +217,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -238,11 +239,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -250,21 +251,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -272,11 +273,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -284,21 +285,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -306,11 +307,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -318,21 +319,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_mov_b32 s0, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -340,11 +341,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -352,21 +353,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_movk_i32 s0, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s1, s0
|
||||
; GFX12-NEXT: s_mov_b32 s2, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_movk_i32 s0, 0x80
|
||||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GCN-NEXT: s_mov_b32 s3, s0
|
||||
; GCN-NEXT: s_mov_b32 s1, s0
|
||||
; GCN-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -384,3 +385,6 @@ declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1170: {{.*}}
|
||||
; GFX12: {{.*}}
|
||||
|
||||
@ -1,12 +1,13 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -14,11 +15,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -26,11 +27,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -40,11 +41,11 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -52,11 +53,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -64,11 +65,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -78,11 +79,11 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -90,11 +91,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -102,11 +103,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -119,11 +120,11 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -131,11 +132,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -143,11 +144,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -157,11 +158,11 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -169,11 +170,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -181,11 +182,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
|
||||
; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -195,11 +196,11 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -207,11 +208,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -219,11 +220,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -236,3 +237,6 @@ declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 imma
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1170: {{.*}}
|
||||
; GFX12: {{.*}}
|
||||
|
||||
@ -1,7 +1,35 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v10, v[10:11], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v9
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v8
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v9
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v8
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v31, v9
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v30, v8
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v29, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[12:13], v[20:23], off
|
||||
; GFX1170-NEXT: global_store_b128 v[14:15], v[24:27], off
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[28:31], off
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
|
||||
@ -46,6 +74,33 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v10, v[10:11], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v9
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v8
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v9
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v8
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v31, v9
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v30, v8
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v29, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[12:13], v[20:23], off
|
||||
; GFX1170-NEXT: global_store_b128 v[14:15], v[24:27], off
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[28:31], off
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
|
||||
@ -90,6 +145,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v22, v[8:9], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
|
||||
; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
|
||||
; GFX1170-NEXT: global_store_b64 v[10:11], v[8:9], off
|
||||
; GFX1170-NEXT: global_store_b64 v[12:13], v[18:19], off
|
||||
; GFX1170-NEXT: global_store_b64 v[14:15], v[20:21], off
|
||||
; GFX1170-NEXT: global_store_b64 v[16:17], v[6:7], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
|
||||
@ -128,6 +204,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v22, v[8:9], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
|
||||
; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
|
||||
; GFX1170-NEXT: global_store_b64 v[10:11], v[8:9], off
|
||||
; GFX1170-NEXT: global_store_b64 v[12:13], v[18:19], off
|
||||
; GFX1170-NEXT: global_store_b64 v[14:15], v[20:21], off
|
||||
; GFX1170-NEXT: global_store_b64 v[16:17], v[6:7], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
|
||||
@ -166,6 +263,33 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
@ -210,6 +334,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v6, v[6:7], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v15, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v14, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v2
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
|
||||
; GFX1170-NEXT: global_store_b128 v[8:9], v[12:15], off
|
||||
; GFX1170-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v6, v[6:7], off
|
||||
@ -236,6 +375,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v16, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v15, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v14, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
|
||||
; GFX1170-NEXT: global_store_b128 v[9:10], v[13:16], off
|
||||
; GFX1170-NEXT: global_store_b128 v[11:12], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
@ -262,6 +416,33 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
@ -306,6 +487,33 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
@ -350,6 +558,33 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
@ -394,6 +629,33 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
@ -448,3 +710,5 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GCN: {{.*}}
|
||||
|
||||
@ -1,12 +1,13 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -14,11 +15,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -26,11 +27,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
|
||||
store <4 x half> %res, ptr addrspace(1) %out
|
||||
@ -38,11 +39,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_bf16_16x16x16_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
|
||||
store <4 x i16> %res, ptr addrspace(1) %out
|
||||
@ -50,11 +51,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -62,11 +63,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -74,11 +75,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -86,11 +87,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -98,11 +99,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -110,11 +111,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -122,11 +123,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -134,11 +135,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
|
||||
; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -146,11 +147,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
|
||||
; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -158,11 +159,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
|
||||
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f16_16x16x32_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
|
||||
; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
|
||||
store <4 x half> %res, ptr addrspace(1) %out
|
||||
@ -170,11 +171,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
|
||||
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
|
||||
; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
|
||||
store <4 x i16> %res, ptr addrspace(1) %out
|
||||
@ -182,11 +183,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -194,11 +195,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
|
||||
; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -206,11 +207,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -218,11 +219,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -230,11 +231,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -242,11 +243,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -254,11 +255,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -287,3 +288,6 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1170: {{.*}}
|
||||
; GFX12: {{.*}}
|
||||
|
||||
@ -1,14 +1,15 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
|
||||
@ -17,13 +18,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <8 x half> %B
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
|
||||
@ -32,13 +33,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
|
||||
@ -47,13 +48,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
|
||||
@ -62,13 +63,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
|
||||
@ -77,13 +78,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
|
||||
@ -92,11 +93,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
|
||||
@ -105,11 +106,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <8 x half> %B
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
|
||||
@ -118,11 +119,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x half> %C
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
|
||||
@ -131,11 +132,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
|
||||
@ -144,13 +145,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
|
||||
@ -159,13 +160,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
|
||||
@ -174,13 +175,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
|
||||
@ -189,13 +190,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
|
||||
@ -204,13 +205,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
|
||||
@ -219,13 +220,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
|
||||
@ -234,13 +235,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <8 x float> %C
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
|
||||
@ -249,13 +250,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
|
||||
@ -264,13 +265,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
|
||||
@ -279,13 +280,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <16 x half> %B
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
|
||||
@ -294,11 +295,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
|
||||
@ -307,11 +308,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <16 x half> %B
|
||||
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
|
||||
@ -322,13 +323,13 @@ bb:
|
||||
; both neg and abs patterns (wmma matrix C f32 or f16 )
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
|
||||
%fneg.fabs.C = fneg <8 x float> %fabs.C
|
||||
@ -338,11 +339,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
|
||||
%fneg.fabs.C = fneg <8 x half> %fabs.C
|
||||
@ -352,15 +353,15 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%el3 = extractelement <8 x float> %C, i32 3
|
||||
%el3.fabs = call float @llvm.fabs.f32(float %el3)
|
||||
@ -374,13 +375,13 @@ bb:
|
||||
; A or B matrix modifier and constant in C
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <8 x half> %A
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
@ -389,11 +390,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <8 x half> %B
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
|
||||
@ -404,6 +405,24 @@ bb:
|
||||
; pack f16 elements with v_perm_b32 since they don't come from same b32
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: flat_load_b128 v[12:15], v[8:9] offset:16
|
||||
; GFX1170-NEXT: flat_load_b128 v[16:19], v[8:9]
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
|
||||
; GFX1170-NEXT: v_mov_b16_e32 v8.l, v15.l
|
||||
; GFX1170-NEXT: v_mov_b16_e32 v9.l, v14.l
|
||||
; GFX1170-NEXT: v_perm_b32 v14, v13, v12, 0x5040100
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1170-NEXT: v_perm_b32 v13, v19, v18, 0x5040100
|
||||
; GFX1170-NEXT: v_perm_b32 v12, v17, v16, 0x5040100
|
||||
; GFX1170-NEXT: v_perm_b32 v15, v8, v9, 0x5040100
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
|
||||
; GFX1170-NEXT: global_store_b128 v[10:11], v[12:15], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
|
||||
@ -1,14 +1,15 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -16,6 +17,24 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v10, 0x40400000
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v11, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v14, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v15, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v16, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v17, v10
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
|
||||
@ -36,13 +55,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -50,6 +69,24 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v10, 0x40400000
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v11, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v14, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v15, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v16, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v17, v10
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
|
||||
@ -70,11 +107,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
@ -82,6 +119,17 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v10, 0x42004200
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v11, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v10
|
||||
; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200
|
||||
@ -98,6 +146,17 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v10, 0x3f803f80
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v11, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v10
|
||||
; GFX1170-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v10, 0x3f803f80
|
||||
@ -114,6 +173,17 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v11, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v10
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v10
|
||||
; GFX1170-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
|
||||
; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
|
||||
@ -130,13 +200,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -144,6 +214,24 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v6, 0x80
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v9, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v10, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v11, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v6
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
|
||||
@ -164,13 +252,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -178,6 +266,24 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v4, 0x80
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v8, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v9, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v10, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v11, v4
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
|
||||
@ -198,13 +304,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -212,6 +318,24 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v9, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v10, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v11, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v6
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
@ -232,13 +356,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -246,6 +370,24 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v9, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v10, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v11, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v6
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
@ -266,13 +408,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -280,6 +422,24 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v9, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v10, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v11, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v6
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
@ -300,13 +460,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -314,6 +474,24 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v9, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v10, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v11, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v6
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
@ -334,13 +512,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -348,6 +526,24 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
|
||||
; GFX1170-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v6, 0x80
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v9, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v10, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v11, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v6
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
|
||||
|
||||
@ -1,14 +1,15 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -16,13 +17,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -30,13 +31,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -46,13 +47,13 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -60,13 +61,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -74,13 +75,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -90,13 +91,13 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -104,13 +105,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -118,13 +119,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -136,13 +137,13 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -150,13 +151,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -164,13 +165,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -180,13 +181,13 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -194,13 +195,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -208,13 +209,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -224,13 +225,13 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -238,13 +239,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -252,13 +253,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -271,3 +272,6 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 immarg, <2 x
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
|
||||
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1170: {{.*}}
|
||||
; GFX12: {{.*}}
|
||||
|
||||
@ -1,7 +1,27 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v20, v[20:21], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[22:23], v[26:29], off
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[24:25], v[12:15], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
|
||||
@ -32,6 +52,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v20, v[20:21], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[22:23], v[26:29], off
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[24:25], v[12:15], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
|
||||
@ -62,6 +101,19 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v16, v[16:17], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
|
||||
; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[22:25], off
|
||||
; GFX1170-NEXT: global_store_b128 v[20:21], v[12:15], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
|
||||
@ -86,6 +138,19 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v16, v[16:17], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
|
||||
; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[22:25], off
|
||||
; GFX1170-NEXT: global_store_b128 v[20:21], v[12:15], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
|
||||
@ -110,6 +175,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
@ -140,6 +224,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v11, v[11:12], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[17:20], off
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v11, v[11:12], off
|
||||
@ -170,6 +273,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
@ -200,6 +322,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
@ -230,6 +371,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
@ -260,6 +420,25 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
|
||||
; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
|
||||
; GFX1170-NEXT: s_clause 0x1
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
|
||||
@ -299,3 +478,5 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GCN: {{.*}}
|
||||
|
||||
@ -1,14 +1,15 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -16,13 +17,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -30,11 +31,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
@ -42,11 +43,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_bf16_16x16x16_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
@ -54,13 +55,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -68,13 +69,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -82,13 +83,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -96,13 +97,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -110,13 +111,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -124,13 +125,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -138,13 +139,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -153,13 +154,13 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -167,13 +168,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -181,11 +182,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f16_16x16x32_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
|
||||
; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
|
||||
store <8 x half> %res, ptr addrspace(1) %out
|
||||
@ -193,11 +194,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
|
||||
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
|
||||
; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
|
||||
store <8 x i16> %res, ptr addrspace(1) %out
|
||||
@ -205,13 +206,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -219,13 +220,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -233,13 +234,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
|
||||
store <8 x i32> %res, ptr addrspace(1) %out
|
||||
@ -247,13 +248,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -261,13 +262,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -275,13 +276,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -289,13 +290,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
|
||||
; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
@ -324,3 +325,6 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1170: {{.*}}
|
||||
; GFX12: {{.*}}
|
||||
|
||||
@ -1,13 +1,15 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GFX12,GFX12-TRUE16
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170-TRUE16
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170-FAKE16
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12-TRUE16
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12-FAKE16
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <4 x half> %A
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
|
||||
@ -16,11 +18,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <4 x half> %B
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
|
||||
@ -29,11 +31,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x float> %C
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
|
||||
@ -42,11 +44,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
|
||||
@ -55,11 +57,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x float> %C
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
|
||||
@ -68,11 +70,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
|
||||
@ -81,11 +83,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <4 x half> %A
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
|
||||
@ -94,11 +96,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <4 x half> %B
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
|
||||
@ -107,11 +109,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x half> %C
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
|
||||
@ -120,11 +122,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
|
||||
@ -133,11 +135,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x float> %C
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
|
||||
@ -146,11 +148,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
|
||||
@ -159,11 +161,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x float> %C
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
|
||||
@ -172,11 +174,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
|
||||
@ -185,11 +187,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x float> %C
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
|
||||
@ -198,11 +200,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
|
||||
@ -211,11 +213,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.C = fneg <4 x float> %C
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
|
||||
@ -224,11 +226,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
|
||||
@ -237,11 +239,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <4 x half> %A
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
|
||||
@ -250,11 +252,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <8 x half> %B
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
|
||||
@ -263,11 +265,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <4 x half> %A
|
||||
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
|
||||
@ -276,11 +278,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <8 x half> %B
|
||||
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
|
||||
@ -291,11 +293,11 @@ bb:
|
||||
; both neg and abs patterns (wmma matrix C f32 or f16 )
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
|
||||
%fneg.fabs.C = fneg <4 x float> %fabs.C
|
||||
@ -305,11 +307,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
|
||||
%fneg.fabs.C = fneg <4 x half> %fabs.C
|
||||
@ -319,13 +321,13 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%el3 = extractelement <4 x float> %C, i32 3
|
||||
%el3.fabs = call float @llvm.fabs.f32(float %el3)
|
||||
@ -339,11 +341,11 @@ bb:
|
||||
; A or B matrix modifier and constant in C
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.A = fneg <4 x half> %A
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
@ -352,11 +354,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%fneg.B = fneg <4 x half> %B
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
|
||||
@ -367,6 +369,29 @@ bb:
|
||||
; pack f16 elements with v_perm_b32 since they don't come from same b32
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
|
||||
; GFX1170-TRUE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
|
||||
; GFX1170-TRUE16: ; %bb.0: ; %bb
|
||||
; GFX1170-TRUE16-NEXT: flat_load_b128 v[8:11], v[4:5]
|
||||
; GFX1170-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1170-TRUE16-NEXT: v_mov_b16_e32 v10.h, v11.l
|
||||
; GFX1170-TRUE16-NEXT: v_mov_b16_e32 v8.h, v9.l
|
||||
; GFX1170-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1170-TRUE16-NEXT: v_mov_b32_e32 v9, v10
|
||||
; GFX1170-TRUE16-NEXT: v_wmma_f16_16x16x16_f16 v[8:9], v[0:1], v[2:3], v[8:9] neg_lo:[0,0,1]
|
||||
; GFX1170-TRUE16-NEXT: global_store_b64 v[6:7], v[8:9], off
|
||||
; GFX1170-TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1170-FAKE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
|
||||
; GFX1170-FAKE16: ; %bb.0: ; %bb
|
||||
; GFX1170-FAKE16-NEXT: flat_load_b128 v[8:11], v[4:5]
|
||||
; GFX1170-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1170-FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
|
||||
; GFX1170-FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
|
||||
; GFX1170-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-FAKE16-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
|
||||
; GFX1170-FAKE16-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX1170-FAKE16-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-TRUE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
|
||||
; GFX12-TRUE16: ; %bb.0: ; %bb
|
||||
; GFX12-TRUE16-NEXT: flat_load_b128 v[8:11], v[4:5]
|
||||
|
||||
@ -1,12 +1,13 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -14,16 +15,16 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v9, v6
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GCN-NEXT: v_mov_b32_e32 v9, v6
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -31,11 +32,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -43,16 +44,16 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v9, v6
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
|
||||
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GCN-NEXT: v_mov_b32_e32 v9, v6
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
|
||||
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -60,11 +61,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
|
||||
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
|
||||
; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
|
||||
store <4 x half> %res, ptr addrspace(1) %out
|
||||
@ -72,14 +73,14 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x42004200
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
|
||||
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, 0x42004200
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
|
||||
; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
|
||||
store <4 x half> %res, ptr addrspace(1) %out
|
||||
@ -87,14 +88,14 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x3f803f80
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
|
||||
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, 0x3f803f80
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
|
||||
; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
|
||||
store <4 x i16> %res, ptr addrspace(1) %out
|
||||
@ -102,14 +103,14 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
|
||||
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, v6
|
||||
; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
|
||||
; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
|
||||
store <4 x i16> %res, ptr addrspace(1) %out
|
||||
@ -117,11 +118,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -129,16 +130,16 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0x80
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -146,11 +147,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -158,16 +159,16 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0x80
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -175,11 +176,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -187,16 +188,16 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -204,11 +205,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -216,16 +217,16 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -233,11 +234,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -245,16 +246,16 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -262,11 +263,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -274,16 +275,16 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0x40400000
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -291,11 +292,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -303,16 +304,16 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0x80
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, v4
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -330,3 +331,6 @@ declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32, i32, <
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32, i32, <4 x float>)
|
||||
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32, i32, <4 x float>)
|
||||
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1170: {{.*}}
|
||||
; GFX12: {{.*}}
|
||||
|
||||
@ -1,12 +1,13 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -14,11 +15,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -26,11 +27,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -40,11 +41,11 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -52,11 +53,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -64,11 +65,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -78,11 +79,11 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -90,11 +91,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -102,11 +103,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -114,11 +115,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -126,11 +127,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -138,11 +139,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -152,11 +153,11 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -164,11 +165,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -176,11 +177,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
|
||||
; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -190,11 +191,11 @@ bb:
|
||||
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -202,11 +203,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -214,11 +215,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -231,3 +232,6 @@ declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 immarg, i32, i
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
|
||||
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1170: {{.*}}
|
||||
; GFX12: {{.*}}
|
||||
|
||||
@ -1,7 +1,35 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v10, v[10:11], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v9
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v8
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v9
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v8
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v31, v9
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v30, v8
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v29, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[12:13], v[20:23], off
|
||||
; GFX1170-NEXT: global_store_b128 v[14:15], v[24:27], off
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[28:31], off
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
|
||||
@ -46,6 +74,33 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v10, v[10:11], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v9
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v8
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v9
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v8
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v31, v9
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v30, v8
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v29, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[12:13], v[20:23], off
|
||||
; GFX1170-NEXT: global_store_b128 v[14:15], v[24:27], off
|
||||
; GFX1170-NEXT: global_store_b128 v[16:17], v[28:31], off
|
||||
; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
|
||||
@ -90,6 +145,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v22, v[8:9], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
|
||||
; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
|
||||
; GFX1170-NEXT: global_store_b64 v[10:11], v[8:9], off
|
||||
; GFX1170-NEXT: global_store_b64 v[12:13], v[18:19], off
|
||||
; GFX1170-NEXT: global_store_b64 v[14:15], v[20:21], off
|
||||
; GFX1170-NEXT: global_store_b64 v[16:17], v[6:7], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
|
||||
@ -128,6 +204,27 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v22, v[8:9], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v7
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
|
||||
; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
|
||||
; GFX1170-NEXT: global_store_b64 v[10:11], v[8:9], off
|
||||
; GFX1170-NEXT: global_store_b64 v[12:13], v[18:19], off
|
||||
; GFX1170-NEXT: global_store_b64 v[14:15], v[20:21], off
|
||||
; GFX1170-NEXT: global_store_b64 v[16:17], v[6:7], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
|
||||
@ -166,6 +263,33 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
@ -210,6 +334,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v6, v[6:7], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v15, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v14, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v12, v2
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
|
||||
; GFX1170-NEXT: global_store_b128 v[8:9], v[12:15], off
|
||||
; GFX1170-NEXT: global_store_b128 v[10:11], v[2:5], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v6, v[6:7], off
|
||||
@ -236,6 +375,21 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
|
||||
; GFX1170-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v16, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v15, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v14, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v13, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
|
||||
; GFX1170-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
|
||||
; GFX1170-NEXT: global_store_b128 v[9:10], v[13:16], off
|
||||
; GFX1170-NEXT: global_store_b128 v[11:12], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
@ -262,6 +416,33 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
@ -306,6 +487,33 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
@ -350,6 +558,33 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
@ -394,6 +629,33 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
|
||||
; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
|
||||
; GFX1170: ; %bb.0: ; %bb
|
||||
; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v20, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v19, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v18, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v17, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v24, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v23, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v22, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v21, v3
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v28, v6
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v27, v5
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v26, v4
|
||||
; GFX1170-NEXT: v_mov_b32_e32 v25, v3
|
||||
; GFX1170-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
|
||||
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
|
||||
; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
|
||||
; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
|
||||
; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
|
||||
; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
|
||||
; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
|
||||
; GFX1170-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
|
||||
@ -448,3 +710,5 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GCN: {{.*}}
|
||||
|
||||
@ -1,12 +1,13 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -14,11 +15,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -26,11 +27,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f16_16x16x16_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
|
||||
store <4 x half> %res, ptr addrspace(1) %out
|
||||
@ -38,11 +39,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
|
||||
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_bf16_16x16x16_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
|
||||
; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
|
||||
store <4 x i16> %res, ptr addrspace(1) %out
|
||||
@ -50,11 +51,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -62,11 +63,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x16_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -74,11 +75,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -86,11 +87,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -98,11 +99,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -110,11 +111,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -122,11 +123,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
|
||||
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_wmma_i32_16x16x32_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
|
||||
; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -134,11 +135,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
|
||||
; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -146,11 +147,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
|
||||
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
|
||||
; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -158,11 +159,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
|
||||
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f16_16x16x32_f16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
|
||||
; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
|
||||
store <4 x half> %res, ptr addrspace(1) %out
|
||||
@ -170,11 +171,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
|
||||
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
|
||||
; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
|
||||
store <4 x i16> %res, ptr addrspace(1) %out
|
||||
@ -182,11 +183,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -194,11 +195,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
|
||||
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x32_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
|
||||
; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -206,11 +207,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_i32_16x16x64_iu4:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
|
||||
store <4 x i32> %res, ptr addrspace(1) %out
|
||||
@ -218,11 +219,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -230,11 +231,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -242,11 +243,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -254,11 +255,11 @@ bb:
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
|
||||
; GFX12: ; %bb.0: ; %bb
|
||||
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
|
||||
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
|
||||
; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
|
||||
store <4 x float> %res, ptr addrspace(1) %out
|
||||
@ -287,3 +288,6 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1170: {{.*}}
|
||||
; GFX12: {{.*}}
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX1170 %s
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s
|
||||
|
||||
# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
|
||||
# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
|
||||
@ -11,12 +12,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
|
||||
...
|
||||
@ -27,12 +28,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
|
||||
...
|
||||
@ -43,11 +44,11 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
...
|
||||
@ -58,12 +59,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
...
|
||||
@ -73,12 +74,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
...
|
||||
@ -89,11 +90,11 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
...
|
||||
@ -104,12 +105,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
...
|
||||
@ -120,12 +121,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
...
|
||||
@ -136,11 +137,11 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
...
|
||||
@ -151,12 +152,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
...
|
||||
@ -167,12 +168,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
|
||||
...
|
||||
@ -183,11 +184,11 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
...
|
||||
@ -198,6 +199,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
|
||||
|
||||
; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
|
||||
; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
|
||||
; GFX1170-NEXT: {{ $}}
|
||||
; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX1170-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec
|
||||
;
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
|
||||
; GFX12-NEXT: {{ $}}
|
||||
@ -214,12 +221,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
...
|
||||
@ -230,12 +237,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
|
||||
...
|
||||
@ -246,11 +253,11 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
|
||||
...
|
||||
@ -261,6 +268,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
|
||||
; GFX1170-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
|
||||
; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX1170-NEXT: {{ $}}
|
||||
; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX1170-NEXT: early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec
|
||||
;
|
||||
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
|
||||
; GFX12-NEXT: {{ $}}
|
||||
@ -277,12 +290,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
|
||||
...
|
||||
@ -293,12 +306,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
|
||||
...
|
||||
@ -309,11 +322,11 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
|
||||
...
|
||||
@ -324,6 +337,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
|
||||
; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
|
||||
; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX1170-NEXT: {{ $}}
|
||||
; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX1170-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec
|
||||
;
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX12-NEXT: {{ $}}
|
||||
@ -340,12 +359,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
|
||||
...
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX1170 %s
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s
|
||||
|
||||
# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
|
||||
# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
|
||||
@ -11,12 +12,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||
...
|
||||
@ -27,12 +28,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||
...
|
||||
@ -43,11 +44,11 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
...
|
||||
@ -58,12 +59,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
...
|
||||
@ -74,12 +75,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
...
|
||||
@ -90,11 +91,11 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
...
|
||||
@ -105,12 +106,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
...
|
||||
@ -121,12 +122,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
...
|
||||
@ -137,11 +138,11 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
...
|
||||
@ -152,12 +153,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
...
|
||||
@ -168,12 +169,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
|
||||
; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
|
||||
...
|
||||
@ -184,11 +185,11 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
|
||||
...
|
||||
@ -199,6 +200,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
|
||||
|
||||
; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
|
||||
; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
|
||||
; GFX1170-NEXT: {{ $}}
|
||||
; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX1170-NEXT: early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec
|
||||
;
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
|
||||
; GFX12-NEXT: {{ $}}
|
||||
@ -215,12 +222,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
|
||||
...
|
||||
@ -231,12 +238,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
|
||||
...
|
||||
@ -247,11 +254,11 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
|
||||
...
|
||||
@ -262,6 +269,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
|
||||
|
||||
; GFX1170-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
|
||||
; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
|
||||
; GFX1170-NEXT: {{ $}}
|
||||
; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX1170-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec
|
||||
;
|
||||
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
|
||||
; GFX12-NEXT: {{ $}}
|
||||
@ -278,12 +291,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
|
||||
...
|
||||
@ -294,12 +307,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
|
||||
...
|
||||
@ -310,11 +323,11 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
|
||||
...
|
||||
@ -325,6 +338,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
|
||||
; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX1170-NEXT: {{ $}}
|
||||
; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX1170-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec
|
||||
;
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
@ -341,12 +360,12 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
|
||||
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
|
||||
; GCN-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||
; GCN-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
|
||||
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
|
||||
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
|
||||
...
|
||||
|
||||
1529
llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s
Normal file
1529
llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s
Normal file
File diff suppressed because it is too large
Load Diff
1529
llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s
Normal file
1529
llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s
Normal file
File diff suppressed because it is too large
Load Diff
@ -206,14 +206,14 @@ v_fract_f64_e32 v[0:1], lit(1.0)
|
||||
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0
|
||||
// GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0xca,0x1b]
|
||||
// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
|
||||
// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
|
||||
// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
|
||||
// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1.0)
|
||||
// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction
|
||||
// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction
|
||||
// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction
|
||||
// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
|
||||
// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
|
||||
|
||||
@ -658,14 +658,14 @@ v_fract_f64_e32 v[0:1], 0xffffffffffffffff
|
||||
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1
|
||||
// GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
|
||||
// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
|
||||
// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
|
||||
// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
|
||||
// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
|
||||
|
||||
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1)
|
||||
// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction
|
||||
// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction
|
||||
// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction
|
||||
// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
|
||||
// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
|
||||
// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
|
||||
|
||||
|
||||
1628
llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt
Normal file
1628
llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt
Normal file
File diff suppressed because it is too large
Load Diff
1628
llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt
Normal file
1628
llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user