[AMDGPU] Remove neg support from 4 more gfx1250 WMMA (#189115)

These are previously covered by AMDGPUWmmaIntrinsicModsAllReuse.
This commit is contained in:
Stanislav Mekhanoshin 2026-03-27 15:20:14 -07:00 committed by GitHub
parent 044876423b
commit a2d84b5d8d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 216 additions and 329 deletions

View File

@ -1605,8 +1605,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
break;
// GFX1250 WMMA builtins
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
ArgsForMatchingMatrixTypes = {5, 1};
ArgsForMatchingMatrixTypes = {3, 0};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
RemoveABNeg = true;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
ArgsForMatchingMatrixTypes = {3, 0};
@ -1614,16 +1615,19 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
RemoveABNeg = true;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
ArgsForMatchingMatrixTypes = {5, 1};
ArgsForMatchingMatrixTypes = {3, 0};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16;
RemoveABNeg = true;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
ArgsForMatchingMatrixTypes = {5, 1};
ArgsForMatchingMatrixTypes = {3, 0};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16;
RemoveABNeg = true;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
ArgsForMatchingMatrixTypes = {5, 1};
ArgsForMatchingMatrixTypes = {3, 0};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16;
RemoveABNeg = true;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
NeedReturnType = true;

View File

@ -367,6 +367,10 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
return false;
}
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
return SemaRef.BuiltinConstantArgRange(TheCall, /*ArgNum=*/0, /*Low=*/0,
/*High=*/0) ||
SemaRef.BuiltinConstantArgRange(TheCall, /*ArgNum=*/2, /*Low=*/0,

View File

@ -246,7 +246,7 @@ __device__ void test_wmma_f16_16x16x128_bf8_bf8(v8h *out, v16i a, v16i b, v8h c)
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR_ASCAST]], align 32
// CHECK-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR_ASCAST]], align 32
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr [[C_ADDR_ASCAST]], align 32
// CHECK-NEXT: [[TMP3:%.*]] = call contract <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> [[TMP0]], i1 false, <16 x half> [[TMP1]], i16 0, <8 x float> [[TMP2]], i1 false, i1 true)
// CHECK-NEXT: [[TMP3:%.*]] = call contract <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> [[TMP0]], <16 x half> [[TMP1]], i16 0, <8 x float> [[TMP2]], i1 false, i1 true)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[TMP4]], align 32
// CHECK-NEXT: ret void
@ -273,7 +273,7 @@ __device__ void test_wmma_f32_16x16x32_f16(v8f *out, v16h a, v16h b, v8f c) {
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR_ASCAST]], align 32
// CHECK-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR_ASCAST]], align 32
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP3:%.*]] = call contract <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 false, <16 x half> [[TMP0]], i1 false, <16 x half> [[TMP1]], i16 0, <8 x half> [[TMP2]], i1 false, i1 true)
// CHECK-NEXT: [[TMP3:%.*]] = call contract <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> [[TMP0]], <16 x half> [[TMP1]], i16 0, <8 x half> [[TMP2]], i1 false, i1 true)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <8 x half> [[TMP3]], ptr [[TMP4]], align 16
// CHECK-NEXT: ret void

View File

@ -17,7 +17,7 @@ typedef int v2i __attribute__((ext_vector_type(2)));
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x4_f32(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 false, <2 x float> [[A:%.*]], i1 false, <2 x float> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> [[A:%.*]], <2 x float> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7:![0-9]+]]
// CHECK-GFX1250-NEXT: ret void
//
@ -39,7 +39,7 @@ void test_amdgcn_wmma_f32_16x16x32_bf16(global v8f* out, v16bf16 a, v16bf16 b, v
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_bf16_16x16x32_bf16(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x bfloat> [[C:%.*]], i1 false, i1 false)
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], i16 0, <8 x bfloat> [[C:%.*]], i1 false, i1 false)
// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
// CHECK-GFX1250-NEXT: ret void
//
@ -207,7 +207,7 @@ void test_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_f16(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> [[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
// CHECK-GFX1250-NEXT: ret void
//
@ -218,7 +218,7 @@ void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c)
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x32_f16(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 false, <16 x half> [[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
// CHECK-GFX1250-NEXT: ret void
//

View File

@ -21,6 +21,8 @@ void test_amdgcn_wmma_f32_16x16x4_f32(global v8f* out, v2f a, v2f b, v8f c, int
*out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}}
*out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}}
*out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}}
*out = __builtin_amdgcn_wmma_f32_16x16x4_f32(1, a, 0, b, 0, c, false, false); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
*out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 1, b, 0, c, false, false); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
}
void test_amdgcn_wmma_f32_16x16x32_bf16(global v8f* out, v16bf16 a, v16bf16 b, v8f c, int mod)
@ -41,6 +43,8 @@ void test_amdgcn_wmma_bf16_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16
*out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}}
*out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}}
*out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}}
*out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(1, a, 0, b, 0, c, false, false); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
*out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 1, b, 0, c, false, false); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
}
void test_amdgcn_wmma_bf16f32_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8f c, int mod)
@ -163,6 +167,8 @@ void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c, i
*out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}}
*out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}}
*out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}}
*out = __builtin_amdgcn_wmma_f32_16x16x32_f16(1, a, 0, b, 0, c, false, false); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
*out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 1, b, 0, c, false, false); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
}
void test_amdgcn_wmma_f16_16x16x32_f16(global v8h* out, v16h a, v16h b, v8h c, int mod)
@ -172,6 +178,8 @@ void test_amdgcn_wmma_f16_16x16x32_f16(global v8h* out, v16h a, v16h b, v8h c, i
*out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}}
*out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}}
*out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}}
*out = __builtin_amdgcn_wmma_f16_16x16x32_f16(1, a, 0, b, 0, c, false, false); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
*out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 1, b, 0, c, false, false); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
}
void test_amdgcn_wmma_f16_16x16x128_fp8_fp8(global v8h* out, v16i a, v16i b, v8h c, int mod)

View File

@ -4015,23 +4015,6 @@ class AMDGPUWmmaIntrinsicModsAll<LLVMType AB, LLVMType CD> :
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
class AMDGPUWmmaIntrinsicModsAllReuse<LLVMType AB, LLVMType CD> :
Intrinsic<
[CD], // %D
[
llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg
AB, // %A
llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg
LLVMMatchType<1>, // %B
llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs)
LLVMMatchType<0>, // %C
llvm_i1_ty, // matrix_a_reuse
llvm_i1_ty, // matrix_b_reuse
],
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
// D and C are of different types.
class AMDGPUWmmaIntrinsicModsAllDiff<LLVMType DstTy, LLVMType AB, LLVMType C> :
Intrinsic<
@ -4111,11 +4094,11 @@ class AMDGPUWmmaScaleF4IntrinsicModsC<LLVMType scale_ty> :
>;
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsC<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsC<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x32_f16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f16_16x16x32_f16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x32_f16 : AMDGPUWmmaIntrinsicModsC<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f16_16x16x32_f16 : AMDGPUWmmaIntrinsicModsC<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsC<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllDiff<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x64_fp8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x64_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;

View File

@ -1318,7 +1318,11 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
}
break;
case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
case Intrinsic::amdgcn_wmma_f32_16x16x4_f32:
case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16:
case Intrinsic::amdgcn_wmma_f32_16x16x32_f16:
case Intrinsic::amdgcn_wmma_f16_16x16x32_f16:
case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16:
if (F->arg_size() == 8) {
NewFn = nullptr;
return true;
@ -4725,7 +4729,11 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
switch (F->getIntrinsicID()) {
default:
break;
case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16: {
case Intrinsic::amdgcn_wmma_f32_16x16x4_f32:
case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16:
case Intrinsic::amdgcn_wmma_f32_16x16x32_f16:
case Intrinsic::amdgcn_wmma_f16_16x16x32_f16:
case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16: {
// Drop src0 and src1 modifiers.
const Value *Op0 = CI->getArgOperand(0);
const Value *Op2 = CI->getArgOperand(2);

View File

@ -1863,16 +1863,20 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], /*_I
// for matrix A, index is i16; Matrix B uses all lanes
def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1, /*_IsF4*/0,
/*_NoABMods*/1>;
def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1, /*_IsF4*/0,
/*_NoABMods*/1>;
def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1, /*_IsF4*/0,
/*_NoABMods*/1>;
def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1, /*_IsF4*/0,
/*_NoABMods*/1>;
def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1, /*_IsF4*/0,
/*_NoABMods*/1>;
def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1,

View File

@ -204,9 +204,9 @@ bb:
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 false, <2 x float> %A, i1 false, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false)
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false)
define amdgpu_kernel void @wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false)
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false)
store <8 x float> %tmp0, ptr addrspace(1) %out
ret void
}
@ -218,23 +218,23 @@ define amdgpu_kernel void @wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %A, i1 false, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false)
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false)
define amdgpu_kernel void @wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false)
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false)
store <8 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 false, <16 x half> %A, i1 false, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false)
; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false)
define amdgpu_kernel void @wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
%tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false)
%tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false)
store <8 x half> %tmp0, ptr addrspace(1) %out
ret void
}
; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 false, <16 x bfloat> %A, i1 false, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false)
; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false)
define amdgpu_kernel void @wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
%tmp0 = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false)
%tmp0 = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false)
store <8 x bfloat> %tmp0, ptr addrspace(1) %out
ret void
}

View File

@ -12,3 +12,55 @@ bb:
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-NEXT: bb:
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 true)
; CHECK-NEXT: store <8 x float> %res, ptr addrspace(1) %out
; CHECK-NEXT: ret void
; CHECK-NEXT: }
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 true)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
; CHECK-NEXT: bb:
; CHECK-NEXT: %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 true)
; CHECK-NEXT: store <8 x bfloat> %res, ptr addrspace(1) %out
; CHECK-NEXT: ret void
; CHECK-NEXT: }
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 true)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-NEXT: bb:
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 true)
; CHECK-NEXT: store <8 x float> %res, ptr addrspace(1) %out
; CHECK-NEXT: ret void
; CHECK-NEXT: }
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 true)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; CHECK-NEXT: bb:
; CHECK-NEXT: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 true)
; CHECK-NEXT: store <8 x half> %res, ptr addrspace(1) %out
; CHECK-NEXT: ret void
; CHECK-NEXT: }
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 true)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}

View File

@ -61,7 +61,7 @@ body: |
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_add_nc_u32_e32 v13, v13, v8
liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12, $vgpr13
$vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, $vgpr0_vgpr1, 8, $vgpr2_vgpr3, 8, $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, -1, 0, 0, implicit $exec
$vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr $vgpr0_vgpr1, $vgpr2_vgpr3, 8, $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, -1, 0, 0, implicit $exec
$vgpr12 = V_EXP_F32_e32 $vgpr12, implicit $exec, implicit $mode
$vgpr13 = V_ADD_U32_e32 $vgpr13, $vgpr8, implicit $exec
...

View File

@ -21,7 +21,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B,
; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 true)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 true)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -65,7 +65,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bflo
; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 true)
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 true)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
@ -309,7 +309,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B
; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 true)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 true)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -329,7 +329,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B
; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 true)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 true)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
@ -2956,9 +2956,9 @@ bb:
ret void
}
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float>, <2 x float>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat>, <16 x bfloat>, i16, <8 x float>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat>, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
@ -2969,8 +2969,8 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>,
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half>, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half>, <16 x half>, i16, <8 x half>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i64, i32, i32, i64, i1, i1)

View File

@ -21,7 +21,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B,
; GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -63,7 +63,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_splat(<2 x float> %A, <2 x
; GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -107,7 +107,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_inlineable(<2 x float> %A,
; GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -221,7 +221,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bflo
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>, i1 false, i1 false)
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
@ -247,7 +247,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_splat(<16 x bfloat> %A,
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 0.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>, i1 false, i1 false)
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 0.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
@ -275,7 +275,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_inlineable(<16 x bfloat>
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5>, i1 false, i1 false)
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5>, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
@ -1259,7 +1259,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -1301,7 +1301,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_splat(<16 x half> %A, <16
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -1345,7 +1345,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_inlineable(<16 x half> %A,
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -1365,7 +1365,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
@ -1396,7 +1396,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_splat(<16 x half> %A, <16
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
@ -1427,7 +1427,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_inlineable(<16 x half> %A,
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 false, i1 false)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
@ -3008,9 +3008,9 @@ bb:
ret void
}
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float>, <2 x float>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat>, <16 x bfloat>, i16, <8 x float>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat>, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
@ -3021,8 +3021,8 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>,
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half>, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half>, <16 x half>, i16, <8 x half>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i64, i32, i32, i64, i1, i1)

View File

@ -2,54 +2,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL
define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negA(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negA:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negA:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off
; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 1, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negB(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negB:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negB:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off
; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 1, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negC:
; GFX1250: ; %bb.0: ; %bb
@ -69,7 +21,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negC(<2 x float> %A, <2 x float
; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 1, <8 x float> %C, i1 false, i1 false)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 1, <8 x float> %C, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -93,7 +45,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_neg_absC(<2 x float> %A, <2 x f
; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 3, <8 x float> %C, i1 false, i1 false)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 3, <8 x float> %C, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -117,7 +69,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_ignoreC(<2 x float> %A, <2 x fl
; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 4, <8 x float> %C, i1 false, i1 false)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 4, <8 x float> %C, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -194,46 +146,6 @@ bb:
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negA:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negA:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 1, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negB:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negB:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 1, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negC:
; GFX1250: ; %bb.0: ; %bb
@ -249,7 +161,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x
; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 1, <8 x bfloat> %C, i1 false, i1 false)
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 1, <8 x bfloat> %C, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
@ -269,7 +181,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <
; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 3, <8 x bfloat> %C, i1 false, i1 false)
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 3, <8 x bfloat> %C, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
@ -289,7 +201,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <1
; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 4, <8 x bfloat> %C, i1 false, i1 false)
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 4, <8 x bfloat> %C, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
@ -970,54 +882,6 @@ bb:
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negA(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negA:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negA:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 1, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negB(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negB:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negB:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 1, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negC:
; GFX1250: ; %bb.0: ; %bb
@ -1037,7 +901,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negC(<16 x half> %A, <16 x hal
; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 1, <8 x float> %C, i1 false, i1 false)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, i16 1, <8 x float> %C, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -1061,7 +925,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_neg_absC(<16 x half> %A, <16 x
; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 3, <8 x float> %C, i1 false, i1 false)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, i16 3, <8 x float> %C, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
@ -1085,51 +949,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_ignoreC(<16 x half> %A, <16 x
; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 4, <8 x float> %C, i1 false, i1 false)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, i16 4, <8 x float> %C, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negA(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negA:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negA:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 1, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negB(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negB:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negB:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 1, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negC:
; GFX1250: ; %bb.0: ; %bb
@ -1145,7 +969,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negC(<16 x half> %A, <16 x hal
; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 1, <8 x half> %C, i1 false, i1 false)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, i16 1, <8 x half> %C, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
@ -1165,7 +989,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_neg_absC(<16 x half> %A, <16 x
; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 3, <8 x half> %C, i1 false, i1 false)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, i16 3, <8 x half> %C, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
@ -1185,7 +1009,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_ignoreC(<16 x half> %A, <16 x
; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 4, <8 x half> %C, i1 false, i1 false)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, i16 4, <8 x half> %C, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
@ -2478,9 +2302,9 @@ bb:
ret void
}
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float>, <2 x float>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat>, <16 x bfloat>, i16, <8 x float>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat>, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
@ -2491,8 +2315,8 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>,
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half>, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half>, <16 x half>, i16, <8 x half>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i64, i32, i32, i64, i1, i1)

View File

@ -105,17 +105,17 @@ body: |
; CHECK-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_WAIT_DSCNT 12
; CHECK-NEXT: early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_WAIT_DSCNT 8
; CHECK-NEXT: early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_WAIT_DSCNT 4
; CHECK-NEXT: early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_WAIT_DSCNT 0
; CHECK-NEXT: early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_BARRIER
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
; CHECK-NEXT: $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
@ -167,18 +167,18 @@ body: |
liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
; WMMA using vgpr10-25 (loaded in preheader or previous iteration's prefetch)
early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
; More WMMAs using other registers
early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; Repeat with same registers
early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
; Barrier
S_BARRIER

View File

@ -10,11 +10,11 @@ name: test_wmma_f32_16x16x4_f32_D0_overlaps_A1
body: |
bb.0:
; GFX1250-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_A1
; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr4_vgpr5, 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
$vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
$vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr4_vgpr5, 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr4_vgpr5, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
$vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
$vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr4_vgpr5, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
...
---
@ -22,11 +22,11 @@ name: test_wmma_f32_16x16x4_f32_D0_overlaps_B1
body: |
bb.0:
; GFX1250-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_B1
; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr14_vgpr15, 8, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
$vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
$vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr14_vgpr15, 8, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr14_vgpr15, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
$vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
$vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr14_vgpr15, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
...
---
@ -34,10 +34,10 @@ name: test_wmma_f32_16x16x4_f32_D0_overlaps_Index1
body: |
bb.0:
; GFX1250-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_Index1
; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr4_vgpr5, 0, 0, 0, implicit $exec
$vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
$vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
$vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr4_vgpr5, 0, 0, 0, implicit $exec
...
@ -94,15 +94,15 @@ name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_A1
body: |
bb.0:
; GFX1250-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_A1
; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
$vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
$vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
$vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
$vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
...
---
@ -110,15 +110,15 @@ name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_B1
body: |
bb.0:
; GFX1250-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_B1
; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
$vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
$vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
$vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
$vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
...
---
@ -126,14 +126,14 @@ name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_Index1
body: |
bb.0:
; GFX1250-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_Index1
; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
$vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
$vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
$vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
...
@ -634,15 +634,15 @@ name: test_wmma_f32_16x16x32_f16_D0_overlaps_A1
body: |
bb.0:
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_A1
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
...
---
@ -650,15 +650,15 @@ name: test_wmma_f32_16x16x32_f16_D0_overlaps_B1
body: |
bb.0:
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_B1
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
...
---
@ -666,14 +666,14 @@ name: test_wmma_f32_16x16x32_f16_D0_overlaps_Index1
body: |
bb.0:
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_Index1
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
$vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
...
@ -682,15 +682,15 @@ name: test_wmma_f16_16x16x32_f16_D0_overlaps_A1
body: |
bb.0:
; GFX1250-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_A1
; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
$vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
$vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
$vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
$vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
...
---
@ -698,15 +698,15 @@ name: test_wmma_f16_16x16x32_f16_D0_overlaps_B1
body: |
bb.0:
; GFX1250-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_B1
; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
$vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
$vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
$vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
$vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
...
---
@ -714,14 +714,14 @@ name: test_wmma_f16_16x16x32_f16_D0_overlaps_Index1
body: |
bb.0:
; GFX1250-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_Index1
; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: V_NOP_e32 implicit $exec
; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
$vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
$vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
$vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
...

View File

@ -1392,11 +1392,11 @@ def ROCDL_wmma_f32_16x16x16_bf8_bf8 : ROCDL_WMMA_IntrOp<"wmma.f32.16x16x16.bf8_b
def ROCDL_wmma_f32_16x16x16_bf8_fp8 : ROCDL_WMMA_IntrOp<"wmma.f32.16x16x16.bf8_fp8", AnyInteger, F32>;
def ROCDL_wmma_i32_16x16x32_iu4 : ROCDL_WMMA_IU_IntrOp<"wmma.i32.16x16x32.iu4", AnyInteger, AnyInteger>;
// Available from gfx1250
def ROCDL_wmma_f32_16x16x4_f32 : ROCDL_WMMA_ModsAll_Reuse_IntrOp<"wmma.f32.16x16x4.f32", F32, F32>;
def ROCDL_wmma_f32_16x16x4_f32 : ROCDL_WMMA_ModsC_IntrOp<"wmma.f32.16x16x4.f32", F32, F32>;
def ROCDL_wmma_f32_16x16x32_bf16 : ROCDL_WMMA_ModsC_IntrOp<"wmma.f32.16x16x32.bf16", BF16, F32>;
def ROCDL_wmma_f32_16x16x32_f16 : ROCDL_WMMA_ModsAll_Reuse_IntrOp<"wmma.f32.16x16x32.f16", F16, F32>;
def ROCDL_wmma_f16_16x16x32_f16 : ROCDL_WMMA_ModsAll_Reuse_IntrOp<"wmma.f16.16x16x32.f16", F16, F16>;
def ROCDL_wmma_bf16_16x16x32_bf16 : ROCDL_WMMA_ModsAll_Reuse_IntrOp<"wmma.bf16.16x16x32.bf16", BF16, BF16>;
def ROCDL_wmma_f32_16x16x32_f16 : ROCDL_WMMA_ModsC_IntrOp<"wmma.f32.16x16x32.f16", F16, F32>;
def ROCDL_wmma_f16_16x16x32_f16 : ROCDL_WMMA_ModsC_IntrOp<"wmma.f16.16x16x32.f16", F16, F16>;
def ROCDL_wmma_bf16_16x16x32_bf16 : ROCDL_WMMA_ModsC_IntrOp<"wmma.bf16.16x16x32.bf16", BF16, BF16>;
def ROCDL_wmma_bf16f32_16x16x32_bf16 : ROCDL_WMMA_ModsAll_Diff_IntrOp<"wmma.bf16f32.16x16x32.bf16", BF16, /*Type C=*/F32, /*Type D=*/BF16>;
def ROCDL_wmma_f32_16x16x64_fp8_fp8 : ROCDL_WMMA_ModsC_IntrOp<"wmma.f32.16x16x64.fp8_fp8", AnyInteger, F32>;
def ROCDL_wmma_f32_16x16x64_fp8_bf8 : ROCDL_WMMA_ModsC_IntrOp<"wmma.f32.16x16x64.fp8_bf8", AnyInteger, F32>;

View File

@ -1028,11 +1028,11 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v
%r6c = rocdl.wmma.i32.16x16x32.iu4 %arg4, %arg4, %arg3 {signA = true, signB = true, clamp = false} : (vector<2xi32>, vector<2xi32>, vector<8xi32>) -> vector<8xi32>
// f32 -> f32
// CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v4f32.v16f32(i1 false, <16 x float> %{{.*}} i1 false, <16 x float> %{{.*}} i16 0, <4 x float> %{{.*}} i1 false, i1 false)
// CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v4f32.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}} i16 0, <4 x float> %{{.*}} i1 false, i1 false)
%r1.gfx1250 = rocdl.wmma.f32.16x16x4.f32 %arg10, %arg10, %arg11 {signA = false, signB = false, modC = 0 : i16} : (vector<16xf32>, vector<16xf32>, vector<4xf32>) -> vector<4xf32>
// f16 -> f32
// CHECK: call <32 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v32f32.v16f16(i1 false, <16 x half> %{{.*}} i1 false, <16 x half> %{{.*}} i16 0, <32 x float> %{{.*}} i1 false, i1 false)
// CHECK: call <32 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v32f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}} i16 0, <32 x float> %{{.*}} i1 false, i1 false)
%r2.gfx1250 = rocdl.wmma.f32.16x16x32.f16 %arg1, %arg1, %arg12 {signA = false, signB = false, modC = 0 : i16} : (vector<16xf16>, vector<16xf16>, vector<32xf32>) -> vector<32xf32>
// bf16 -> f32
@ -1040,11 +1040,11 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v
%r3.gfx1250 = rocdl.wmma.f32.16x16x32.bf16 %arg16, %arg16, %arg12 {signA = false, signB = false, modC = 0 : i16} : (vector<16xbf16>, vector<16xbf16>, vector<32xf32>) -> vector<32xf32>
// f16 -> f16
// CHECK: call <32 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v32f16.v16f16(i1 false, <16 x half> %{{.*}} i1 false, <16 x half> %{{.*}} i16 0, <32 x half> %{{.*}} i1 false, i1 false)
// CHECK: call <32 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v32f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}} i16 0, <32 x half> %{{.*}} i1 false, i1 false)
%r4.gfx1250 = rocdl.wmma.f16.16x16x32.f16 %arg1, %arg1, %arg9 {signA = false, signB = false, modC = 0 : i16} : (vector<16xf16>, vector<16xf16>, vector<32xf16>) -> vector<32xf16>
// bf16 -> bf16
// CHECK: call <32 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v32bf16.v16bf16(i1 false, <16 x bfloat> %{{.*}} i1 false, <16 x bfloat> %{{.*}} i16 0, <32 x bfloat> %{{.*}} i1 false, i1 false)
// CHECK: call <32 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v32bf16.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} i16 0, <32 x bfloat> %{{.*}} i1 false, i1 false)
%r5.gfx1250 = rocdl.wmma.bf16.16x16x32.bf16 %arg16, %arg16, %arg17 {signA = false, signB = false, modC = 0 : i16} : (vector<16xbf16>, vector<16xbf16>, vector<32xbf16>) -> vector<32xbf16>
// bf16 -> bf16 / f32
@ -1112,12 +1112,12 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v
// CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 true, <4 x i32> %{{.*}} i1 false, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 true, i1 true, i1 false)
%r23b.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = true, signB = false, reuseA = true, reuseB = true, clamp=false} : (vector<4xi32>, vector<4xi32>, vector<64xi32>) -> vector<64xi32>
// Test signA=true, signB=true with modC=1 for f32 gfx1250
// CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v4f32.v16f32(i1 true, <16 x float> %{{.*}} i1 true, <16 x float> %{{.*}} i16 1, <4 x float> %{{.*}} i1 false, i1 false)
%r1a.gfx1250 = rocdl.wmma.f32.16x16x4.f32 %arg10, %arg10, %arg11 {signA = true, signB = true, modC = 1 : i16, reuseA = false, reuseB = false} : (vector<16xf32>, vector<16xf32>, vector<4xf32>) -> vector<4xf32>
// Test signA=false, signB=false with modC=1 for f32 gfx1250
// CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v4f32.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}} i16 1, <4 x float> %{{.*}} i1 false, i1 false)
%r1a.gfx1250 = rocdl.wmma.f32.16x16x4.f32 %arg10, %arg10, %arg11 {signA = false, signB = false, modC = 1 : i16, reuseA = false, reuseB = false} : (vector<16xf32>, vector<16xf32>, vector<4xf32>) -> vector<4xf32>
// Test with modC=2 and signA=false, signB=true, reuseA=true for f16 gfx1250
// CHECK: call <32 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v32f32.v16f16(i1 false, <16 x half> %{{.*}} i1 true, <16 x half> %{{.*}} i16 2, <32 x float> %{{.*}} i1 true, i1 false)
// CHECK: call <32 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v32f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}} i16 2, <32 x float> %{{.*}} i1 true, i1 false)
%r2a.gfx1250 = rocdl.wmma.f32.16x16x32.f16 %arg1, %arg1, %arg12 {signA = false, signB = true, modC = 2 : i16, reuseA = true, reuseB = false} : (vector<16xf16>, vector<16xf16>, vector<32xf32>) -> vector<32xf32>
// Test with modC=3 and signA=false, signB=false, reuseB=true for bf16 gfx1250