[AMDGPU] Disable neg_lo[0:1] and neg_hi[0:1] on wmma_f32_16x16x32_bf16 (#188649)
This is the pilot change; the rest will follow the same approach.
This commit is contained in:
parent
a0181d0507
commit
e69c7312f3
@ -1463,6 +1463,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
||||
unsigned BuiltinWMMAOp;
|
||||
// Need return type when D and C are of different types.
|
||||
bool NeedReturnType = false;
|
||||
// Need to remove unused neg modifiers.
|
||||
bool RemoveABNeg = false;
|
||||
|
||||
switch (BuiltinID) {
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
|
||||
@ -1607,8 +1609,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
|
||||
ArgsForMatchingMatrixTypes = {5, 1};
|
||||
ArgsForMatchingMatrixTypes = {3, 0};
|
||||
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16;
|
||||
RemoveABNeg = true;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
|
||||
ArgsForMatchingMatrixTypes = {5, 1};
|
||||
@ -1778,8 +1781,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
||||
}
|
||||
|
||||
SmallVector<Value *, 6> Args;
|
||||
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
|
||||
for (int i = 0, e = E->getNumArgs(); i != e; ++i) {
|
||||
// Remove unused neg modifiers.
|
||||
if (RemoveABNeg && (i == 0 || i == 2))
|
||||
continue;
|
||||
Args.push_back(EmitScalarExpr(E->getArg(i)));
|
||||
}
|
||||
if (AppendFalseForOpselArg)
|
||||
Args.push_back(Builder.getFalse());
|
||||
|
||||
|
||||
@ -366,6 +366,11 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
|
||||
}
|
||||
return false;
|
||||
}
|
||||
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
|
||||
return SemaRef.BuiltinConstantArgRange(TheCall, /*ArgNum=*/0, /*Low=*/0,
|
||||
/*High=*/0) ||
|
||||
SemaRef.BuiltinConstantArgRange(TheCall, /*ArgNum=*/2, /*Low=*/0,
|
||||
/*High=*/0);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -18,7 +18,7 @@ typedef int v2i __attribute__((ext_vector_type(2)));
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x4_f32(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 false, <2 x float> [[A:%.*]], i1 false, <2 x float> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8:![0-9]+]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7:![0-9]+]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x4_f32(global v8f* out, v2f a, v2f b, v8f c)
|
||||
@ -28,8 +28,8 @@ void test_amdgcn_wmma_f32_16x16x4_f32(global v8f* out, v2f a, v2f b, v8f c)
|
||||
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_bf16(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 false)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 false)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x32_bf16(global v8f* out, v16bf16 a, v16bf16 b, v8f c)
|
||||
@ -40,7 +40,7 @@ void test_amdgcn_wmma_f32_16x16x32_bf16(global v8f* out, v16bf16 a, v16bf16 b, v
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_bf16_16x16x32_bf16(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x bfloat> [[C:%.*]], i1 false, i1 false)
|
||||
// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_bf16_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8bf16 c)
|
||||
@ -51,7 +51,7 @@ void test_amdgcn_wmma_bf16_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_bf16f32_16x16x32_bf16(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16.v8f32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_bf16f32_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8f c)
|
||||
@ -62,7 +62,7 @@ void test_amdgcn_wmma_bf16f32_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_fp8_fp8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x64_fp8_fp8(global v8f* out, v8i a, v8i b, v8f c)
|
||||
@ -73,7 +73,7 @@ void test_amdgcn_wmma_f32_16x16x64_fp8_fp8(global v8f* out, v8i a, v8i b, v8f c)
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_fp8_bf8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x64_fp8_bf8(global v8f* out, v8i a, v8i b, v8f c)
|
||||
@ -84,7 +84,7 @@ void test_amdgcn_wmma_f32_16x16x64_fp8_bf8(global v8f* out, v8i a, v8i b, v8f c)
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_bf8_fp8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x64_bf8_fp8(global v8f* out, v8i a, v8i b, v8f c)
|
||||
@ -95,7 +95,7 @@ void test_amdgcn_wmma_f32_16x16x64_bf8_fp8(global v8f* out, v8i a, v8i b, v8f c)
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_bf8_bf8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x64_bf8_bf8(global v8f* out, v8i a, v8i b, v8f c)
|
||||
@ -106,7 +106,7 @@ void test_amdgcn_wmma_f32_16x16x64_bf8_bf8(global v8f* out, v8i a, v8i b, v8f c)
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_fp8_fp8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x64_fp8_fp8(global v8h* out, v8i a, v8i b, v8h c)
|
||||
@ -117,7 +117,7 @@ void test_amdgcn_wmma_f16_16x16x64_fp8_fp8(global v8h* out, v8i a, v8i b, v8h c)
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_fp8_bf8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x64_fp8_bf8(global v8h* out, v8i a, v8i b, v8h c)
|
||||
@ -128,7 +128,7 @@ void test_amdgcn_wmma_f16_16x16x64_fp8_bf8(global v8h* out, v8i a, v8i b, v8h c)
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_bf8_fp8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x64_bf8_fp8(global v8h* out, v8i a, v8i b, v8h c)
|
||||
@ -139,7 +139,7 @@ void test_amdgcn_wmma_f16_16x16x64_bf8_fp8(global v8h* out, v8i a, v8i b, v8h c)
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_bf8_bf8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x64_bf8_bf8(global v8h* out, v8i a, v8i b, v8h c)
|
||||
@ -150,7 +150,7 @@ void test_amdgcn_wmma_f16_16x16x64_bf8_bf8(global v8h* out, v8i a, v8i b, v8h c)
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_i32_16x16x64_iu8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true, i1 false)
|
||||
// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c)
|
||||
@ -161,7 +161,7 @@ void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c)
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_i32_16x16x64_iu8_clamp(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_i32_16x16x64_iu8_clamp(global v8i* out, v8i a, v8i b, v8i c)
|
||||
@ -173,7 +173,7 @@ void test_amdgcn_wmma_i32_16x16x64_iu8_clamp(global v8i* out, v8i a, v8i b, v8i
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
|
||||
// CHECK-GFX1250-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A:%.*]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C:%.*]])
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP1]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP1]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c)
|
||||
@ -185,7 +185,7 @@ void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
|
||||
// CHECK-GFX1250-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A:%.*]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C:%.*]], i32 1, i32 2, i32 [[SCALE_SRC0:%.*]], i32 2, i32 1, i32 [[SCALE_SRC1:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP1]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP1]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c, int scale_src0, int scale_src1)
|
||||
@ -197,7 +197,7 @@ void test_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
|
||||
// CHECK-GFX1250-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A:%.*]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C:%.*]], i32 1, i32 2, i64 [[SCALE_SRC0:%.*]], i32 2, i32 1, i64 [[SCALE_SRC1:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP1]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP1]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c, long scale_src0, long scale_src1)
|
||||
@ -208,7 +208,7 @@ void test_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_f16(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> [[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c)
|
||||
@ -219,7 +219,7 @@ void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c)
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x32_f16(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 false, <16 x half> [[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x32_f16(global v8h* out, v16h a, v16h b, v8h c)
|
||||
@ -230,7 +230,7 @@ void test_amdgcn_wmma_f16_16x16x32_f16(global v8h* out, v16h a, v16h b, v8h c)
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_fp8_fp8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x128_fp8_fp8(global v8h* out, v16i a, v16i b, v8h c)
|
||||
@ -241,7 +241,7 @@ void test_amdgcn_wmma_f16_16x16x128_fp8_fp8(global v8h* out, v16i a, v16i b, v8h
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_fp8_bf8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x128_fp8_bf8(global v8h* out, v16i a, v16i b, v8h c)
|
||||
@ -252,7 +252,7 @@ void test_amdgcn_wmma_f16_16x16x128_fp8_bf8(global v8h* out, v16i a, v16i b, v8h
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_bf8_fp8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x128_bf8_fp8(global v8h* out, v16i a, v16i b, v8h c)
|
||||
@ -263,7 +263,7 @@ void test_amdgcn_wmma_f16_16x16x128_bf8_fp8(global v8h* out, v16i a, v16i b, v8h
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_bf8_bf8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f16_16x16x128_bf8_bf8(global v8h* out, v16i a, v16i b, v8h c)
|
||||
@ -274,7 +274,7 @@ void test_amdgcn_wmma_f16_16x16x128_bf8_bf8(global v8h* out, v16i a, v16i b, v8h
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_fp8_fp8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x128_fp8_fp8(global v8f* out, v16i a, v16i b, v8f c)
|
||||
@ -285,7 +285,7 @@ void test_amdgcn_wmma_f32_16x16x128_fp8_fp8(global v8f* out, v16i a, v16i b, v8f
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_fp8_bf8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x128_fp8_bf8(global v8f* out, v16i a, v16i b, v8f c)
|
||||
@ -296,7 +296,7 @@ void test_amdgcn_wmma_f32_16x16x128_fp8_bf8(global v8f* out, v16i a, v16i b, v8f
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_bf8_fp8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x128_bf8_fp8(global v8f* out, v16i a, v16i b, v8f c)
|
||||
@ -307,7 +307,7 @@ void test_amdgcn_wmma_f32_16x16x128_bf8_fp8(global v8f* out, v16i a, v16i b, v8f
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_bf8_bf8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_16x16x128_bf8_bf8(global v8f* out, v16i a, v16i b, v8f c)
|
||||
@ -318,7 +318,7 @@ void test_amdgcn_wmma_f32_16x16x128_bf8_bf8(global v8f* out, v16i a, v16i b, v8f
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]])
|
||||
// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c)
|
||||
@ -329,7 +329,7 @@ void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b,
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale_f32_32x16x128_f4(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]], i32 1, i32 2, i32 [[SCALE_SRC0:%.*]], i32 2, i32 1, i32 [[SCALE_SRC1:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_scale_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int scale_src0, int scale_src1)
|
||||
@ -340,7 +340,7 @@ void test_amdgcn_wmma_scale_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v1
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale16_f32_32x16x128_f4(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]], i32 1, i32 2, i64 [[SCALE_SRC0:%.*]], i32 2, i32 1, i64 [[SCALE_SRC1:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_wmma_scale16_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, long scale_src0, long scale_src1)
|
||||
@ -351,7 +351,7 @@ void test_amdgcn_wmma_scale16_f32_32x16x128_f4(global v16f* out, v16i a, v8i b,
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_bf16(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index)
|
||||
@ -362,7 +362,7 @@ void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b,
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_bf16_16x16x64_bf16(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x bfloat> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_bf16_16x16x64_bf16(global v8bf16* out, v16bf16 a, v32bf16 b, v8bf16 c, int index)
|
||||
@ -373,7 +373,7 @@ void test_amdgcn_swmmac_bf16_16x16x64_bf16(global v8bf16* out, v16bf16 a, v32bf1
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_bf16f32_16x16x64_bf16(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_bf16f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index)
|
||||
@ -384,7 +384,7 @@ void test_amdgcn_swmmac_bf16f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf1
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_fp8_fp8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.v2i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], <2 x i32> [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x128_fp8_fp8(global v8f* out, v8i a, v16i b, v8f c, v2i index)
|
||||
@ -395,7 +395,7 @@ void test_amdgcn_swmmac_f32_16x16x128_fp8_fp8(global v8f* out, v8i a, v16i b, v8
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_fp8_bf8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.v2i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], <2 x i32> [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x128_fp8_bf8(global v8f* out, v8i a, v16i b, v8f c, v2i index)
|
||||
@ -406,7 +406,7 @@ void test_amdgcn_swmmac_f32_16x16x128_fp8_bf8(global v8f* out, v8i a, v16i b, v8
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_bf8_fp8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.v2i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], <2 x i32> [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x128_bf8_fp8(global v8f* out, v8i a, v16i b, v8f c, v2i index)
|
||||
@ -417,7 +417,7 @@ void test_amdgcn_swmmac_f32_16x16x128_bf8_fp8(global v8f* out, v8i a, v16i b, v8
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_bf8_bf8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.v2i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], <2 x i32> [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x128_bf8_bf8(global v8f* out, v8i a, v16i b, v8f c, v2i index)
|
||||
@ -428,7 +428,7 @@ void test_amdgcn_swmmac_f32_16x16x128_bf8_bf8(global v8f* out, v8i a, v16i b, v8
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_fp8_fp8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.v2i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], <2 x i32> [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f16_16x16x128_fp8_fp8(global v8h* out, v8i a, v16i b, v8h c, v2i index)
|
||||
@ -439,7 +439,7 @@ void test_amdgcn_swmmac_f16_16x16x128_fp8_fp8(global v8h* out, v8i a, v16i b, v8
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_fp8_bf8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.v2i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], <2 x i32> [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f16_16x16x128_fp8_bf8(global v8h* out, v8i a, v16i b, v8h c, v2i index)
|
||||
@ -450,7 +450,7 @@ void test_amdgcn_swmmac_f16_16x16x128_fp8_bf8(global v8h* out, v8i a, v16i b, v8
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_bf8_fp8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.v2i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], <2 x i32> [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f16_16x16x128_bf8_fp8(global v8h* out, v8i a, v16i b, v8h c, v2i index)
|
||||
@ -461,7 +461,7 @@ void test_amdgcn_swmmac_f16_16x16x128_bf8_fp8(global v8h* out, v8i a, v16i b, v8
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_bf8_bf8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.v2i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], <2 x i32> [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f16_16x16x128_bf8_bf8(global v8h* out, v8i a, v16i b, v8h c, v2i index)
|
||||
@ -472,7 +472,7 @@ void test_amdgcn_swmmac_f16_16x16x128_bf8_bf8(global v8h* out, v8i a, v16i b, v8
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_i32_16x16x128_iu8(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.v2i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <16 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], <2 x i32> [[INDEX:%.*]], i1 false, i1 true, i1 false)
|
||||
// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x128_iu8(global v8i* out, v8i a, v16i b, v8i c, v2i index)
|
||||
@ -483,7 +483,7 @@ void test_amdgcn_swmmac_i32_16x16x128_iu8(global v8i* out, v8i a, v16i b, v8i c,
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_i32_16x16x128_iu8_clamp(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.v2i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <16 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], <2 x i32> [[INDEX:%.*]], i1 false, i1 true, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_i32_16x16x128_iu8_clamp(global v8i* out, v8i a, v16i b, v8i c, v2i index)
|
||||
@ -494,7 +494,7 @@ void test_amdgcn_swmmac_i32_16x16x128_iu8_clamp(global v8i* out, v8i a, v16i b,
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_f16(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i32(i1 false, <16 x half> [[A:%.*]], i1 false, <32 x half> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f32_16x16x64_f16(global v8f* out, v16h a, v32h b, v8f c, int index)
|
||||
@ -505,7 +505,7 @@ void test_amdgcn_swmmac_f32_16x16x64_f16(global v8f* out, v16h a, v32h b, v8f c,
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x64_f16(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i32(i1 false, <16 x half> [[A:%.*]], i1 false, <32 x half> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA8]]
|
||||
// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA7]]
|
||||
// CHECK-GFX1250-NEXT: ret void
|
||||
//
|
||||
void test_amdgcn_swmmac_f16_16x16x64_f16(global v8h* out, v16h a, v32h b, v8h c, int index)
|
||||
|
||||
@ -30,6 +30,8 @@ void test_amdgcn_wmma_f32_16x16x32_bf16(global v8f* out, v16bf16 a, v16bf16 b, v
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(1, a, 0, b, 0, c, false, false); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
|
||||
*out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 1, b, 0, c, false, false); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
|
||||
}
|
||||
|
||||
void test_amdgcn_wmma_bf16_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8bf16 c, int mod)
|
||||
|
||||
@ -4112,7 +4112,7 @@ class AMDGPUWmmaScaleF4IntrinsicModsC<LLVMType scale_ty> :
|
||||
|
||||
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
|
||||
def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsC<llvm_anyfloat_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_f32_16x16x32_f16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_f16_16x16x32_f16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
|
||||
def int_amdgcn_wmma_bf16_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
|
||||
|
||||
@ -1307,16 +1307,23 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
|
||||
break; // No other 'amdgcn.atomic.*'
|
||||
}
|
||||
|
||||
switch (F->getIntrinsicID()) {
|
||||
default:
|
||||
break;
|
||||
// Legacy wmma iu intrinsics without the optional clamp operand.
|
||||
if (F->getIntrinsicID() == Intrinsic::amdgcn_wmma_i32_16x16x64_iu8 &&
|
||||
F->arg_size() == 7) {
|
||||
NewFn = nullptr;
|
||||
return true;
|
||||
}
|
||||
if (F->getIntrinsicID() == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 &&
|
||||
F->arg_size() == 8) {
|
||||
NewFn = nullptr;
|
||||
return true;
|
||||
case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:
|
||||
if (F->arg_size() == 7) {
|
||||
NewFn = nullptr;
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
|
||||
case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16:
|
||||
if (F->arg_size() == 8) {
|
||||
NewFn = nullptr;
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (Name.consume_front("ds.") || Name.consume_front("global.atomic.") ||
|
||||
@ -4715,6 +4722,41 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
|
||||
return UpgradeLegacyWMMAIUIntrinsicCall(F, CI, Builder, {T1, T2, T3, T4});
|
||||
}
|
||||
|
||||
switch (F->getIntrinsicID()) {
|
||||
default:
|
||||
break;
|
||||
case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16: {
|
||||
// Drop src0 and src1 modifiers.
|
||||
const Value *Op0 = CI->getArgOperand(0);
|
||||
const Value *Op2 = CI->getArgOperand(2);
|
||||
assert(Op0->getType()->isIntegerTy() && Op2->getType()->isIntegerTy());
|
||||
const ConstantInt *ModA = dyn_cast<ConstantInt>(Op0);
|
||||
const ConstantInt *ModB = dyn_cast<ConstantInt>(Op2);
|
||||
if (!ModA->isZero() || !ModB->isZero())
|
||||
reportFatalUsageError(Name + " matrix A and B modifiers shall be zero");
|
||||
|
||||
SmallVector<Value *, 8> Args{CI->getArgOperand(1), CI->getArgOperand(3)};
|
||||
for (int I = 4, E = CI->arg_size(); I < E; ++I)
|
||||
Args.push_back(CI->getArgOperand(I));
|
||||
|
||||
Function *NewDecl = Intrinsic::getOrInsertDeclaration(
|
||||
F->getParent(), F->getIntrinsicID(),
|
||||
{F->getReturnType(), Args[0]->getType()});
|
||||
|
||||
SmallVector<OperandBundleDef, 1> Bundles;
|
||||
CI->getOperandBundlesAsDefs(Bundles);
|
||||
|
||||
auto *NewCall = cast<CallInst>(Builder.CreateCall(NewDecl, Args, Bundles));
|
||||
NewCall->setTailCallKind(cast<CallInst>(CI)->getTailCallKind());
|
||||
NewCall->setCallingConv(CI->getCallingConv());
|
||||
NewCall->setAttributes(CI->getAttributes());
|
||||
NewCall->setDebugLoc(CI->getDebugLoc());
|
||||
NewCall->copyMetadata(*CI);
|
||||
NewCall->takeName(CI);
|
||||
return NewCall;
|
||||
}
|
||||
}
|
||||
|
||||
AtomicRMWInst::BinOp RMWOp =
|
||||
StringSwitch<AtomicRMWInst::BinOp>(Name)
|
||||
.StartsWith("ds.fadd", AtomicRMWInst::FAdd)
|
||||
|
||||
@ -1503,10 +1503,11 @@ let WaveSizePredicate = isWave64 in {
|
||||
class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
bit _IsIU, bit _IsFP8BF8, bit _Has_ImodOp = 0,
|
||||
bit _HasMatrixFMT = 0, bit _HasMatrixScale = 0,
|
||||
bit _Scale16 = 0, bit _HasMatrixReuse = 0, bit _IsF4 = 0>
|
||||
bit _Scale16 = 0, bit _HasMatrixReuse = 0, bit _IsF4 = 0,
|
||||
bit _NoABMods = 0>
|
||||
: VOP3P_Profile<VOPProfile<ArgTy>> {
|
||||
bit IsIU = _IsIU;
|
||||
bit NoABMods = !or(_IsFP8BF8, _IsF4); // No IMOD support for A and B
|
||||
bit NoABMods = !or(_IsFP8BF8, _IsF4, _NoABMods); // No IMOD support for A and B
|
||||
|
||||
int IndexType = _IndexType;
|
||||
let HasMatrixFMT = _HasMatrixFMT;
|
||||
@ -1526,6 +1527,9 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
bit IsAB_BF16 = !or(!eq(ArgTy[1], v16i16), !eq(ArgTy[1], v8i16), !eq(ArgTy[1], v4i16),
|
||||
!eq(ArgTy[1], v16bf16), !eq(ArgTy[1], v8bf16), !eq(ArgTy[1], v4bf16));
|
||||
bit IsF16BF16 = !or(IsAB_F16, IsAB_BF16);
|
||||
bit IsAB_F4F6F8 = !and(!not(IsIU), !or(!eq(ArgTy[1], v8i32), !eq(ArgTy[1], v16i32),
|
||||
!eq(ArgTy[1], v24i32), !eq(ArgTy[1], v32i32),
|
||||
!eq(ArgTy[1], v48i32), !eq(ArgTy[1], v64i32)));
|
||||
|
||||
bit IsC_F64 = !eq(ArgTy[3], v8f64);
|
||||
bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32));
|
||||
@ -1533,12 +1537,18 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
!eq(ArgTy[3], v8bf16), !eq(ArgTy[3], v4bf16));
|
||||
bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16));
|
||||
|
||||
bit NegLo01 = !not(NoABMods);
|
||||
bit NegLo0 = !not(NoABMods);
|
||||
bit NegLo1 = !not(NoABMods);
|
||||
bit NegLo2 = !and(!not(IsIU), IsWMMA);
|
||||
bit NegHi01 = IsF16BF16; // Only F16BF16 can have neg_hi[0:1]
|
||||
bit NegHi0 = !and(IsF16BF16, !not(NoABMods)); // Only F16BF16 can have neg_hi[0]
|
||||
bit NegHi1 = !and(IsF16BF16, !not(NoABMods)); // Only F16BF16 can have neg_hi[1]
|
||||
bit NegHi2 = !and(!not(IsIU), IsWMMA);
|
||||
bit NegLoAny = !or(NegLo01, NegLo2);
|
||||
bit NegHiAny = !or(NegHi01, NegHi2);
|
||||
bit NegLoAny = !or(NegLo0, NegLo1, NegLo2);
|
||||
bit NegHiAny = !or(NegHi0, NegHi1, NegHi2);
|
||||
// HasSrc0Mods/HasSrc1Mods: true if that source has any modifier bits.
|
||||
bit HasSrc0Mods = !or(NegLo0, NegHi0);
|
||||
bit HasSrc1Mods = !or(NegLo1, NegHi1);
|
||||
bit HasSrc2Mods = !and(!not(IsIU), !not(IsSWMMAC));
|
||||
|
||||
let DstRC = !cast<RegisterOperand>("VDst_"#ArgTy[0].Size);
|
||||
let Src0RC64 = !cast<RegisterOperand>("VRegSrc_"#ArgTy[1].Size);
|
||||
@ -1601,9 +1611,10 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
// use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers,
|
||||
// remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32
|
||||
// f16 or bf16). swmmac use index_key and don't use src 2 modifiers.
|
||||
dag Src0Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src0_modifiers));
|
||||
dag Src1Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src1_modifiers));
|
||||
dag Src2Mods = !if(!or(IsIU, IsSWMMAC), (ins), (ins PackedF16InputMods:$src2_modifiers));
|
||||
// srcN_modifiers is only included if that source has neg_lo or neg_hi bits.
|
||||
dag Src0Mods = !if(HasSrc0Mods, (ins PackedF16InputMods:$src0_modifiers), (ins));
|
||||
dag Src1Mods = !if(HasSrc1Mods, (ins PackedF16InputMods:$src1_modifiers), (ins));
|
||||
dag Src2Mods = !if(HasSrc2Mods, (ins PackedF16InputMods:$src2_modifiers), (ins));
|
||||
dag IndexKey = !cond(!eq(IndexType, 0) : (ins),
|
||||
!eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit),
|
||||
!eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit),
|
||||
@ -1652,34 +1663,44 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
|
||||
// isel patterns
|
||||
bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp));
|
||||
bit IsAB_F16_IMod0 = !and(IsAB_F16, !not(HasIModOp));
|
||||
bit IsAB_F32F64_IMod1 = !and(!or(IsAB_F64, IsAB_F32), HasIModOp);
|
||||
bit IsAB_F16BF16_IMod1 = !and(!or(IsAB_F16, IsAB_BF16), HasIModOp);
|
||||
dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins timm:$src0_modifiers, Src0VT:$src0),
|
||||
IsAB_F16BF16_IMod1 : (ins timm:$src0_modifiers, Src0VT:$src0),
|
||||
bit IsA_F32F64_IMod1 = !and(!or(IsAB_F64, IsAB_F32), HasIModOp, HasSrc0Mods);
|
||||
bit IsB_F32F64_IMod1 = !and(!or(IsAB_F64, IsAB_F32), HasIModOp, HasSrc1Mods);
|
||||
bit IsA_F16BF16_IMod1 = !and(!or(IsAB_F16, IsAB_BF16), HasIModOp, HasSrc0Mods);
|
||||
bit IsB_F16BF16_IMod1 = !and(!or(IsAB_F16, IsAB_BF16), HasIModOp, HasSrc1Mods);
|
||||
bit IsA_F4F6F8_IMod1 = !and(IsAB_F4F6F8, HasIModOp, HasSrc0Mods);
|
||||
bit IsB_F4F6F8_IMod1 = !and(IsAB_F4F6F8, HasIModOp, HasSrc1Mods);
|
||||
dag Src0InPat = !cond(IsA_F32F64_IMod1 : (ins timm:$src0_modifiers, Src0VT:$src0),
|
||||
IsA_F16BF16_IMod1 : (ins timm:$src0_modifiers, Src0VT:$src0),
|
||||
IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
|
||||
IsAB_BF16_IMod0 : (ins Src0VT:$src0),
|
||||
IsIU : (ins timm:$src0_modifiers, Src0VT:$src0),
|
||||
HasMatrixFMT : (ins timm:$matrix_a_fmt, Src0VT:$src0),
|
||||
NoABMods : (ins Src0VT:$src0));
|
||||
dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg $src0_modifiers), Src0VT:$src0),
|
||||
IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs $src0_modifiers), Src0VT:$src0),
|
||||
dag Src0OutPat = !cond(IsA_F32F64_IMod1 : (ins (VOP3PModsNeg $src0_modifiers), Src0VT:$src0),
|
||||
IsA_F16BF16_IMod1 : (ins (VOP3PModsNegs $src0_modifiers), Src0VT:$src0),
|
||||
IsAB_F16_IMod0 : (ins i32:$src0_modifiers, Src0VT:$src0),
|
||||
IsAB_BF16_IMod0 : (ins (i32 8), Src0VT:$src0),
|
||||
IsIU : (ins (VOP3PModsNeg $src0_modifiers), Src0VT:$src0),
|
||||
NoABMods : (ins Src0VT:$src0));
|
||||
dag Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins timm:$src1_modifiers, Src1VT:$src1),
|
||||
IsAB_F16BF16_IMod1 : (ins timm:$src1_modifiers, Src1VT:$src1),
|
||||
NoABMods : (ins Src0VT:$src0),
|
||||
IsA_F4F6F8_IMod1 : (ins (VOP3PModsNegs $src0_modifiers), Src0VT:$src0),
|
||||
1 : (ins Src0VT:$src0));
|
||||
dag Src1InPat = !cond(IsB_F32F64_IMod1 : (ins timm:$src1_modifiers, Src1VT:$src1),
|
||||
IsB_F16BF16_IMod1 : (ins timm:$src1_modifiers, Src1VT:$src1),
|
||||
IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
|
||||
IsAB_BF16_IMod0 : (ins Src1VT:$src1),
|
||||
IsIU : (ins timm:$src1_modifiers, Src1VT:$src1),
|
||||
HasMatrixFMT : (ins timm:$matrix_b_fmt, Src1VT:$src1),
|
||||
NoABMods : (ins Src1VT:$src1));
|
||||
dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg $src1_modifiers), Src1VT:$src1),
|
||||
IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs $src1_modifiers), Src1VT:$src1),
|
||||
NoABMods : (ins Src1VT:$src1),
|
||||
IsB_F4F6F8_IMod1 : (ins timm:$src1_modifiers, Src1VT:$src1),
|
||||
1 : (ins Src1VT:$src1));
|
||||
dag Src1OutPat = !cond(IsB_F32F64_IMod1 : (ins (VOP3PModsNeg $src1_modifiers), Src1VT:$src1),
|
||||
IsB_F16BF16_IMod1 : (ins (VOP3PModsNegs $src1_modifiers), Src1VT:$src1),
|
||||
IsAB_F16_IMod0 : (ins i32:$src1_modifiers, Src1VT:$src1),
|
||||
IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1),
|
||||
IsIU : (ins (VOP3PModsNeg $src1_modifiers), Src1VT:$src1),
|
||||
NoABMods : (ins Src1VT:$src1));
|
||||
NoABMods : (ins Src1VT:$src1),
|
||||
IsB_F4F6F8_IMod1 : (ins (VOP3PModsNegs $src0_modifiers), Src1VT:$src1),
|
||||
1 : (ins Src1VT:$src1));
|
||||
bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU));
|
||||
bit IsC_F32_IMod0 = !and(IsC_F32, !not(HasIModOp));
|
||||
bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp));
|
||||
@ -1844,7 +1865,8 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], /*_I
|
||||
def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
|
||||
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
|
||||
def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
|
||||
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
|
||||
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1, /*_IsF4*/0,
|
||||
/*_NoABMods*/1>;
|
||||
def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
|
||||
/*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>;
|
||||
def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0,
|
||||
@ -2237,12 +2259,12 @@ class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
|
||||
let Inst{14} = !if (WMMAP.HasMatrixFMT, matrix_b_fmt{2},
|
||||
!if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1));
|
||||
// neg_lo
|
||||
let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
|
||||
let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
|
||||
let Inst{61} = !if(WMMAP.NegLo0, src0_modifiers{0}, 0);
|
||||
let Inst{62} = !if(WMMAP.NegLo1, src1_modifiers{0}, 0);
|
||||
let Inst{63} = !if(WMMAP.NegLo2, src2_modifiers{0}, 0);
|
||||
// neg_hi
|
||||
let Inst{8} = !if(WMMAP.NegHi01, src0_modifiers{1}, 0);
|
||||
let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0);
|
||||
let Inst{8} = !if(WMMAP.NegHi0, src0_modifiers{1}, 0);
|
||||
let Inst{9} = !if(WMMAP.NegHi1, src1_modifiers{1}, 0);
|
||||
let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0);
|
||||
// clamp
|
||||
let Inst{15} = !if(WMMAP.HasClamp, clamp{0}, 0);
|
||||
@ -2338,8 +2360,8 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO
|
||||
// The high half of the encoding is the unscaled wmma op.
|
||||
let Inst{71-64} = vdst;
|
||||
|
||||
let Inst{72} = !if(P.NegHi01, src0_modifiers{1}, 0); // neg_hi src0
|
||||
let Inst{73} = !if(P.NegHi01, src1_modifiers{1}, 0); // neg_hi src1
|
||||
let Inst{72} = !if(P.NegHi0, src0_modifiers{1}, 0); // neg_hi src0
|
||||
let Inst{73} = !if(P.NegHi1, src1_modifiers{1}, 0); // neg_hi src1
|
||||
let Inst{74} = !if(P.NegHi2, src2_modifiers{1}, 0); // neg_hi src2
|
||||
|
||||
let Inst{77-75} = !if(P.HasMatrixFMT, matrix_a_fmt{2-0}, 0); // op_sel
|
||||
@ -2354,8 +2376,8 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO
|
||||
let Inst{122-114} = !if(P.HasSrc2, src2, ?);
|
||||
|
||||
// neg_lo
|
||||
let Inst{125} = !if(P.NegLo01, src0_modifiers{0}, 0);
|
||||
let Inst{126} = !if(P.NegLo01, src1_modifiers{0}, 0);
|
||||
let Inst{125} = !if(P.NegLo0, src0_modifiers{0}, 0);
|
||||
let Inst{126} = !if(P.NegLo1, src1_modifiers{0}, 0);
|
||||
let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0);
|
||||
}
|
||||
|
||||
|
||||
@ -211,9 +211,9 @@ define amdgpu_kernel void @wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B,
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 false, <16 x bfloat> %A, i1 false, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false)
|
||||
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false)
|
||||
define amdgpu_kernel void @wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false)
|
||||
%tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false)
|
||||
store <8 x float> %tmp0, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -0,0 +1,9 @@
|
||||
; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
|
||||
|
||||
; CHECK: LLVM ERROR: wmma.f32.16x16x32.bf16.v8f32.v16bf16 matrix A and B modifiers shall be zero
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 true, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
14
llvm/test/Bitcode/amdgpu-wmma-drop-ab-mods-upgrade.ll
Normal file
14
llvm/test/Bitcode/amdgpu-wmma-drop-ab-mods-upgrade.ll
Normal file
@ -0,0 +1,14 @@
|
||||
; RUN: llvm-as < %s | llvm-dis | FileCheck %s
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; CHECK-NEXT: bb:
|
||||
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true)
|
||||
; CHECK-NEXT: store <8 x float> %res, ptr addrspace(1) %out, align 32
|
||||
; CHECK-NEXT: ret void
|
||||
; CHECK-NEXT: }
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
@ -45,7 +45,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloa
|
||||
; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
@ -2957,7 +2957,7 @@ bb:
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat>, <16 x bfloat>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
|
||||
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
|
||||
|
||||
@ -131,7 +131,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloa
|
||||
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
@ -165,7 +165,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <
|
||||
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
@ -201,7 +201,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_inlineable(<16 x bfloat>
|
||||
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
@ -3009,7 +3009,7 @@ bb:
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat>, <16 x bfloat>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
|
||||
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
|
||||
|
||||
@ -122,54 +122,6 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negA:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negA:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||
; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 1, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negB:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||
; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negB:
|
||||
; GISEL: ; %bb.0: ; %bb
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GISEL-NEXT: s_clause 0x1
|
||||
; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||
; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 1, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
|
||||
; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negC:
|
||||
; GFX1250: ; %bb.0: ; %bb
|
||||
@ -189,7 +141,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x
|
||||
; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 1, <8 x float> %C, i1 false, i1 false)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 1, <8 x float> %C, i1 false, i1 false)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
@ -213,7 +165,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <1
|
||||
; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 3, <8 x float> %C, i1 false, i1 false)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 3, <8 x float> %C, i1 false, i1 false)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
@ -237,7 +189,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16
|
||||
; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||
; GISEL-NEXT: s_endpgm
|
||||
bb:
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 4, <8 x float> %C, i1 false, i1 false)
|
||||
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 4, <8 x float> %C, i1 false, i1 false)
|
||||
store <8 x float> %res, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
@ -2527,7 +2479,7 @@ bb:
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat>, <16 x bfloat>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
|
||||
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
|
||||
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
|
||||
|
||||
@ -244,7 +244,7 @@ body: |
|
||||
; GCN-NEXT: s_set_vgpr_msb 0xb55
|
||||
; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1
|
||||
; GCN-NEXT: v_wmma_f32_16x16x32_bf16 v[14:21] /*v[270:277]*/, v[26:33] /*v[282:289]*/, v[34:41] /*v[290:297]*/, v[14:21] /*v[270:277]*/
|
||||
early-clobber $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, undef $vgpr282_vgpr283_vgpr284_vgpr285_vgpr286_vgpr287_vgpr288_vgpr289, 8, undef $vgpr290_vgpr291_vgpr292_vgpr293_vgpr294_vgpr295_vgpr296_vgpr297, 8, killed undef $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277, 0, 0, 0, 0, implicit $exec
|
||||
early-clobber $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr undef $vgpr282_vgpr283_vgpr284_vgpr285_vgpr286_vgpr287_vgpr288_vgpr289, undef $vgpr290_vgpr291_vgpr292_vgpr293_vgpr294_vgpr295_vgpr296_vgpr297, 8, killed undef $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277, 0, 0, 0, 0, implicit $exec
|
||||
|
||||
; ASM: NumVgprs: 1024
|
||||
|
||||
|
||||
@ -13,13 +13,13 @@ name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
@ -28,13 +28,13 @@ name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_valus_in_between
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_valus_in_between
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: $vgpr26 = V_MOV_B32_e32 26, implicit $exec
|
||||
; GFX1250-NEXT: $vgpr27 = V_MOV_B32_e32 27, implicit $exec
|
||||
; GFX1250-NEXT: $vgpr28 = V_MOV_B32_e32 28, implicit $exec
|
||||
; GFX1250-NEXT: $vgpr29 = V_MOV_B32_e32 29, implicit $exec
|
||||
; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr26 = V_MOV_B32_e32 26, implicit $exec
|
||||
$vgpr27 = V_MOV_B32_e32 27, implicit $exec
|
||||
$vgpr28 = V_MOV_B32_e32 28, implicit $exec
|
||||
@ -47,7 +47,7 @@ name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_salus_in_between
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_salus_in_between
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0
|
||||
; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1
|
||||
; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2
|
||||
@ -57,7 +57,7 @@ body: |
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$sgpr0 = S_MOV_B32 0
|
||||
$sgpr1 = S_MOV_B32 1
|
||||
$sgpr2 = S_MOV_B32 2
|
||||
@ -70,13 +70,13 @@ name: test_wmma_f32_16x16x32_bf16_D0_overlaps_D1
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_D1
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
@ -85,13 +85,13 @@ name: test_wmma_f32_16x16x32_bf16_D0_overlaps_vopd_vdstX
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_vopd_vdstX
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: $vgpr16, $vgpr255 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr1, undef $vgpr0, undef $vgpr0, implicit $mode, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16, $vgpr255 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr1, undef $vgpr0, undef $vgpr0, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
@ -100,13 +100,13 @@ name: test_wmma_f32_16x16x32_bf16_A0_overlaps_D1
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_A0_overlaps_D1
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr0 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
@ -115,13 +115,13 @@ name: test_wmma_f32_16x16x32_bf16_A0_overlaps_vopd_vdstY
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_A0_overlaps_vopd_vdstY
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: $vgpr255, $vgpr0 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr1, undef $vgpr0, undef $vgpr0, implicit $mode, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr255, $vgpr0 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr1, undef $vgpr0, undef $vgpr0, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
@ -130,13 +130,13 @@ name: test_wmma_f32_16x16x32_bf16_B0_overlaps_D1
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_B0_overlaps_D1
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr8 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
@ -936,13 +936,13 @@ name: test_wmma_trans_B0_overlaps_D1
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_trans_B0_overlaps_D1
|
||||
; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: $vgpr8 = V_EXP_F32_e32 $vgpr34, implicit $mode, implicit $exec
|
||||
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr8 = V_EXP_F32_e32 $vgpr34, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
@ -951,8 +951,8 @@ name: test_wmma_tdm_load_D0_overlaps_Use1
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_tdm_load_D0_overlaps_Use1
|
||||
; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR killed $sgpr0_sgpr1, killed $vgpr24, killed $vgpr0, 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
|
||||
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
|
||||
GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR killed $sgpr0_sgpr1, killed $vgpr24, killed $vgpr0, 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
|
||||
...
|
||||
|
||||
@ -46,15 +46,15 @@ name: test_wmma_f32_16x16x32_bf16_D0_overlaps_A1
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_A1
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
@ -62,15 +62,15 @@ name: test_wmma_f32_16x16x32_bf16_D0_overlaps_B1
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_B1
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
@ -78,14 +78,14 @@ name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Index1
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Index1
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: V_NOP_e32 implicit $exec
|
||||
; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
|
||||
@ -11,7 +11,7 @@ body: |
|
||||
; NOHOIST: bb.0:
|
||||
; NOHOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: S_BRANCH %bb.1
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: bb.1:
|
||||
@ -28,7 +28,7 @@ body: |
|
||||
; HOIST: bb.0:
|
||||
; HOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
; HOIST-NEXT: {{ $}}
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
@ -42,7 +42,7 @@ body: |
|
||||
; HOIST-NEXT: S_BRANCH %bb.1
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
bb.1:
|
||||
successors: %bb.1
|
||||
@ -63,7 +63,7 @@ body: |
|
||||
; NOHOIST-NEXT: bb.1:
|
||||
; NOHOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; NOHOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; NOHOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
@ -80,7 +80,7 @@ body: |
|
||||
; HOIST-NEXT: bb.1:
|
||||
; HOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
; HOIST-NEXT: {{ $}}
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
@ -93,7 +93,7 @@ body: |
|
||||
bb.1:
|
||||
successors: %bb.1
|
||||
; WMMA inside the loop writes to vgpr16-23, VALU reads vgpr16
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
...
|
||||
@ -106,13 +106,13 @@ body: |
|
||||
; NOHOIST: bb.0:
|
||||
; NOHOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: S_BRANCH %bb.1
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: bb.1:
|
||||
; NOHOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; NOHOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; NOHOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
@ -123,7 +123,7 @@ body: |
|
||||
; HOIST: bb.0:
|
||||
; HOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
; HOIST-NEXT: {{ $}}
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
@ -132,18 +132,18 @@ body: |
|
||||
; HOIST-NEXT: bb.1:
|
||||
; HOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
; HOIST-NEXT: {{ $}}
|
||||
; HOIST-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
; HOIST-NEXT: S_BRANCH %bb.1
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
; External WMMA writes to vgpr16-23
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
bb.1:
|
||||
successors: %bb.1
|
||||
; Loop WMMA writes to vgpr56-63 (different registers)
|
||||
$vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, 8, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, 0, implicit $exec
|
||||
; This reads vgpr16 from the external WMMA
|
||||
$vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
@ -158,7 +158,7 @@ body: |
|
||||
; NOHOIST: bb.0:
|
||||
; NOHOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: S_BRANCH %bb.1
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: bb.1:
|
||||
@ -185,7 +185,7 @@ body: |
|
||||
; HOIST: bb.0:
|
||||
; HOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
; HOIST-NEXT: {{ $}}
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
@ -210,7 +210,7 @@ body: |
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
; WMMA outside all loops - writes to vgpr16-23
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
bb.1:
|
||||
; Outer loop header - can exit to bb.3 or continue to bb.2
|
||||
@ -244,7 +244,7 @@ body: |
|
||||
; NOHOIST-NEXT: bb.1:
|
||||
; NOHOIST-NEXT: successors: %bb.2(0x80000000)
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: S_BRANCH %bb.2
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: bb.2:
|
||||
@ -287,7 +287,7 @@ body: |
|
||||
; HOIST-NEXT: bb.1:
|
||||
; HOIST-NEXT: successors: %bb.2(0x80000000)
|
||||
; HOIST-NEXT: {{ $}}
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
@ -325,7 +325,7 @@ body: |
|
||||
S_BRANCH %bb.1
|
||||
bb.1:
|
||||
successors: %bb.2
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
S_BRANCH %bb.2
|
||||
bb.2:
|
||||
successors: %bb.3
|
||||
@ -359,7 +359,7 @@ body: |
|
||||
; NOHOIST: bb.0:
|
||||
; NOHOIST-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: S_CBRANCH_SCC1 %bb.2, implicit undef $scc
|
||||
; NOHOIST-NEXT: S_BRANCH %bb.1
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
@ -386,7 +386,7 @@ body: |
|
||||
; HOIST: bb.0:
|
||||
; HOIST-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||
; HOIST-NEXT: {{ $}}
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: S_CBRANCH_SCC1 %bb.2, implicit undef $scc
|
||||
; HOIST-NEXT: S_BRANCH %bb.1
|
||||
; HOIST-NEXT: {{ $}}
|
||||
@ -410,7 +410,7 @@ body: |
|
||||
; HOIST-NEXT: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1, %bb.2
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
S_CBRANCH_SCC1 %bb.2, implicit undef $scc
|
||||
S_BRANCH %bb.1
|
||||
bb.1:
|
||||
@ -434,7 +434,7 @@ body: |
|
||||
; NOHOIST: bb.0:
|
||||
; NOHOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: {{ $}}
|
||||
; NOHOIST-NEXT: bb.1:
|
||||
; NOHOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
@ -450,7 +450,7 @@ body: |
|
||||
; HOIST: bb.0:
|
||||
; HOIST-NEXT: successors: %bb.1(0x80000000)
|
||||
; HOIST-NEXT: {{ $}}
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
@ -463,7 +463,7 @@ body: |
|
||||
; HOIST-NEXT: S_BRANCH %bb.1
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
bb.1:
|
||||
successors: %bb.1
|
||||
$vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
@ -476,7 +476,7 @@ name: test_not_in_loop_no_hoist
|
||||
body: |
|
||||
bb.0:
|
||||
; NOHOIST-LABEL: name: test_not_in_loop_no_hoist
|
||||
; NOHOIST: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; NOHOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; NOHOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
@ -484,13 +484,13 @@ body: |
|
||||
; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
;
|
||||
; HOIST-LABEL: name: test_not_in_loop_no_hoist
|
||||
; HOIST: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
@ -512,7 +512,7 @@ body: |
|
||||
; NOHOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; NOHOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; NOHOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; NOHOIST-NEXT: S_BRANCH %bb.1
|
||||
;
|
||||
; HOIST-LABEL: name: test_valu_before_wmma_backedge_no_hoist
|
||||
@ -529,7 +529,7 @@ body: |
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: V_NOP_e32 implicit $exec
|
||||
; HOIST-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
; HOIST-NEXT: S_BRANCH %bb.1
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
@ -537,6 +537,6 @@ body: |
|
||||
bb.1:
|
||||
successors: %bb.1
|
||||
$vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
...
|
||||
|
||||
@ -1393,7 +1393,7 @@ def ROCDL_wmma_f32_16x16x16_bf8_fp8 : ROCDL_WMMA_IntrOp<"wmma.f32.16x16x16.bf8_f
|
||||
def ROCDL_wmma_i32_16x16x32_iu4 : ROCDL_WMMA_IU_IntrOp<"wmma.i32.16x16x32.iu4", AnyInteger, AnyInteger>;
|
||||
// Available from gfx1250
|
||||
def ROCDL_wmma_f32_16x16x4_f32 : ROCDL_WMMA_ModsAll_Reuse_IntrOp<"wmma.f32.16x16x4.f32", F32, F32>;
|
||||
def ROCDL_wmma_f32_16x16x32_bf16 : ROCDL_WMMA_ModsAll_Reuse_IntrOp<"wmma.f32.16x16x32.bf16", BF16, F32>;
|
||||
def ROCDL_wmma_f32_16x16x32_bf16 : ROCDL_WMMA_ModsC_IntrOp<"wmma.f32.16x16x32.bf16", BF16, F32>;
|
||||
def ROCDL_wmma_f32_16x16x32_f16 : ROCDL_WMMA_ModsAll_Reuse_IntrOp<"wmma.f32.16x16x32.f16", F16, F32>;
|
||||
def ROCDL_wmma_f16_16x16x32_f16 : ROCDL_WMMA_ModsAll_Reuse_IntrOp<"wmma.f16.16x16x32.f16", F16, F16>;
|
||||
def ROCDL_wmma_bf16_16x16x32_bf16 : ROCDL_WMMA_ModsAll_Reuse_IntrOp<"wmma.bf16.16x16x32.bf16", BF16, BF16>;
|
||||
|
||||
@ -1036,7 +1036,7 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v
|
||||
%r2.gfx1250 = rocdl.wmma.f32.16x16x32.f16 %arg1, %arg1, %arg12 {signA = false, signB = false, modC = 0 : i16} : (vector<16xf16>, vector<16xf16>, vector<32xf32>) -> vector<32xf32>
|
||||
|
||||
// bf16 -> f32
|
||||
// CHECK: call <32 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v32f32.v16bf16(i1 false, <16 x bfloat> %{{.*}} i1 false, <16 x bfloat> %{{.*}} i16 0, <32 x float> %{{.*}} i1 false, i1 false)
|
||||
// CHECK: call <32 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v32f32.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} i16 0, <32 x float> %{{.*}} i1 false, i1 false)
|
||||
%r3.gfx1250 = rocdl.wmma.f32.16x16x32.bf16 %arg16, %arg16, %arg12 {signA = false, signB = false, modC = 0 : i16} : (vector<16xbf16>, vector<16xbf16>, vector<32xf32>) -> vector<32xf32>
|
||||
|
||||
// f16 -> f16
|
||||
@ -1120,9 +1120,9 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v
|
||||
// CHECK: call <32 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v32f32.v16f16(i1 false, <16 x half> %{{.*}} i1 true, <16 x half> %{{.*}} i16 2, <32 x float> %{{.*}} i1 true, i1 false)
|
||||
%r2a.gfx1250 = rocdl.wmma.f32.16x16x32.f16 %arg1, %arg1, %arg12 {signA = false, signB = true, modC = 2 : i16, reuseA = true, reuseB = false} : (vector<16xf16>, vector<16xf16>, vector<32xf32>) -> vector<32xf32>
|
||||
|
||||
// Test with modC=3 and signA=true, signB=true, reuseB=true for bf16 gfx1250
|
||||
// CHECK: call <32 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v32f32.v16bf16(i1 true, <16 x bfloat> %{{.*}} i1 true, <16 x bfloat> %{{.*}} i16 3, <32 x float> %{{.*}} i1 false, i1 true)
|
||||
%r3a.gfx1250 = rocdl.wmma.f32.16x16x32.bf16 %arg16, %arg16, %arg12 {signA = true, signB = true, modC = 3 : i16, reuseA = false, reuseB = true} : (vector<16xbf16>, vector<16xbf16>, vector<32xf32>) -> vector<32xf32>
|
||||
// Test with modC=3 and signA=false, signB=false, reuseB=true for bf16 gfx1250
|
||||
// CHECK: call <32 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v32f32.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} i16 3, <32 x float> %{{.*}} i1 false, i1 true)
|
||||
%r3a.gfx1250 = rocdl.wmma.f32.16x16x32.bf16 %arg16, %arg16, %arg12 {signA = false, signB = false, modC = 3 : i16, reuseA = false, reuseB = true} : (vector<16xbf16>, vector<16xbf16>, vector<32xf32>) -> vector<32xf32>
|
||||
|
||||
// ---- Wave64 -----
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user