[AMDGPU] v_cvt_scale_pk16 gfx1250 instructions (#151804)

This commit is contained in:
Stanislav Mekhanoshin 2025-08-02 10:45:02 -07:00 committed by GitHub
parent dfbf13cded
commit 7598c25b5a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 458 additions and 0 deletions

View File

@ -716,6 +716,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, "V8yUiUiIUi", "nc", "gfx
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", "gfx1250-insts")
// gfx1250 v_cvt_scale_pk16_* builtins: convert an FP6/BF6 source, passed as a
// 3 x unsigned-int vector, into 16 half (h), bfloat (y) or float (f) elements.
// Arguments are (source, unsigned scale, scale_sel); the 'I' prefix on the last
// argument requires scale_sel to be an integer constant expression.
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk16_f16_fp6, "V16hV3UiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk16_bf16_fp6, "V16yV3UiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk16_f16_bf6, "V16hV3UiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk16_bf16_bf6, "V16yV3UiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk16_f32_fp6, "V16fV3UiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk16_f32_bf6, "V16fV3UiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", "gfx1250-insts")

View File

@ -93,6 +93,12 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_fp8: case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_fp8:
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_bf8: case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_bf8:
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_fp4: case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_fp4:
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f16_fp6:
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_bf16_fp6:
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f16_bf6:
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_bf16_bf6:
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_fp6:
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_bf6:
return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 7); return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 7);
} }
default: default:

View File

@ -655,6 +655,36 @@ void test_cvt_sr_fp8_f16(global int* out, half a, short sr, int old)
// CHECK-NEXT: [[TMP34:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 [[TMP32]], i32 [[TMP33]], i32 7) // CHECK-NEXT: [[TMP34:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 [[TMP32]], i32 [[TMP33]], i32 7)
// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <8 x float> [[TMP34]], ptr addrspace(1) [[TMP35]], align 32 // CHECK-NEXT: store <8 x float> [[TMP34]], ptr addrspace(1) [[TMP35]], align 32
// CHECK-NEXT: [[TMP36:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP38:%.*]] = call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.fp6(<3 x i32> [[TMP36]], i32 [[TMP37]], i32 0)
// CHECK-NEXT: [[TMP39:%.*]] = load ptr addrspace(1), ptr [[OUTH16_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <16 x half> [[TMP38]], ptr addrspace(1) [[TMP39]], align 32
// CHECK-NEXT: [[TMP40:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP42:%.*]] = call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.fp6(<3 x i32> [[TMP40]], i32 [[TMP41]], i32 1)
// CHECK-NEXT: [[TMP43:%.*]] = load ptr addrspace(1), ptr [[OUTY16_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <16 x bfloat> [[TMP42]], ptr addrspace(1) [[TMP43]], align 32
// CHECK-NEXT: [[TMP44:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP46:%.*]] = call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.bf6(<3 x i32> [[TMP44]], i32 [[TMP45]], i32 2)
// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr [[OUTH16_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <16 x half> [[TMP46]], ptr addrspace(1) [[TMP47]], align 32
// CHECK-NEXT: [[TMP48:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP50:%.*]] = call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.bf6(<3 x i32> [[TMP48]], i32 [[TMP49]], i32 3)
// CHECK-NEXT: [[TMP51:%.*]] = load ptr addrspace(1), ptr [[OUTY16_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <16 x bfloat> [[TMP50]], ptr addrspace(1) [[TMP51]], align 32
// CHECK-NEXT: [[TMP52:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP54:%.*]] = call <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.fp6(<3 x i32> [[TMP52]], i32 [[TMP53]], i32 3)
// CHECK-NEXT: [[TMP55:%.*]] = load ptr addrspace(1), ptr [[OUTF16_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <16 x float> [[TMP54]], ptr addrspace(1) [[TMP55]], align 64
// CHECK-NEXT: [[TMP56:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP57:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP58:%.*]] = call <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.bf6(<3 x i32> [[TMP56]], i32 [[TMP57]], i32 4)
// CHECK-NEXT: [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUTF16_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <16 x float> [[TMP58]], ptr addrspace(1) [[TMP59]], align 64
// CHECK-NEXT: ret void // CHECK-NEXT: ret void
// //
void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2, void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
@ -672,6 +702,12 @@ void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 5); *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 5);
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 6); *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 6);
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7); *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
*outh16 = __builtin_amdgcn_cvt_scale_pk16_f16_fp6(src3, scale, 0);
*outy16 = __builtin_amdgcn_cvt_scale_pk16_bf16_fp6(src3, scale, 1);
*outh16 = __builtin_amdgcn_cvt_scale_pk16_f16_bf6(src3, scale, 2);
*outy16 = __builtin_amdgcn_cvt_scale_pk16_bf16_bf6(src3, scale, 3);
*outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_fp6(src3, scale, 3);
*outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_bf6(src3, scale, 4);
} }
// CHECK-LABEL: @test_cvt_scalef32_pk( // CHECK-LABEL: @test_cvt_scalef32_pk(

View File

@ -57,6 +57,12 @@ void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_fp8' must be a constant integer}} *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_fp8' must be a constant integer}}
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_bf8' must be a constant integer}} *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_bf8' must be a constant integer}}
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_fp4' must be a constant integer}} *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_fp4' must be a constant integer}}
*outh16 = __builtin_amdgcn_cvt_scale_pk16_f16_fp6(src3, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk16_f16_fp6' must be a constant integer}}
*outy16 = __builtin_amdgcn_cvt_scale_pk16_bf16_fp6(src3, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk16_bf16_fp6' must be a constant integer}}
*outh16 = __builtin_amdgcn_cvt_scale_pk16_f16_bf6(src3, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk16_f16_bf6' must be a constant integer}}
*outy16 = __builtin_amdgcn_cvt_scale_pk16_bf16_bf6(src3, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk16_bf16_bf6' must be a constant integer}}
*outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_fp6(src3, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk16_f32_fp6' must be a constant integer}}
*outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_bf6(src3, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk16_f32_bf6' must be a constant integer}}
*outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
*outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
@ -67,6 +73,12 @@ void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
*outh16 = __builtin_amdgcn_cvt_scale_pk16_f16_fp6(src3, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
*outy16 = __builtin_amdgcn_cvt_scale_pk16_bf16_fp6(src3, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
*outh16 = __builtin_amdgcn_cvt_scale_pk16_f16_bf6(src3, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
*outy16 = __builtin_amdgcn_cvt_scale_pk16_bf16_bf6(src3, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
*outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_fp6(src3, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
*outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_bf6(src3, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
} }
void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global v4i* b128gaddr, int *b32faddr, v2i* b64faddr, v4i *b128faddr, void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global v4i* b128gaddr, int *b32faddr, v2i* b64faddr, v4i *b128faddr,

View File

@ -655,6 +655,12 @@ def int_amdgcn_cvt_scale_pk8_bf16_fp4 : AMDGPUCvtScaleIntrinsic<llvm_v8bf16_ty,
def int_amdgcn_cvt_scale_pk8_f32_fp8 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_v2i32_ty, "cvt_scale_pk8_f32_fp8">; def int_amdgcn_cvt_scale_pk8_f32_fp8 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_v2i32_ty, "cvt_scale_pk8_f32_fp8">;
def int_amdgcn_cvt_scale_pk8_f32_bf8 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_v2i32_ty, "cvt_scale_pk8_f32_bf8">; def int_amdgcn_cvt_scale_pk8_f32_bf8 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_v2i32_ty, "cvt_scale_pk8_f32_bf8">;
def int_amdgcn_cvt_scale_pk8_f32_fp4 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_i32_ty, "cvt_scale_pk8_f32_fp4">; def int_amdgcn_cvt_scale_pk8_f32_fp4 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_i32_ty, "cvt_scale_pk8_f32_fp4">;
// v_cvt_scale_pk16_* intrinsics: FP6/BF6 source to a 16-element f16, bf16 or
// f32 result. All six take a v3i32 source; the remaining operands are supplied
// by AMDGPUCvtScaleIntrinsic, whose definition is outside this hunk.
def int_amdgcn_cvt_scale_pk16_f16_bf6 : AMDGPUCvtScaleIntrinsic<llvm_v16f16_ty, llvm_v3i32_ty, "cvt_scale_pk16_f16_bf6">;
def int_amdgcn_cvt_scale_pk16_bf16_bf6 : AMDGPUCvtScaleIntrinsic<llvm_v16bf16_ty, llvm_v3i32_ty, "cvt_scale_pk16_bf16_bf6">;
def int_amdgcn_cvt_scale_pk16_f16_fp6 : AMDGPUCvtScaleIntrinsic<llvm_v16f16_ty, llvm_v3i32_ty, "cvt_scale_pk16_f16_fp6">;
def int_amdgcn_cvt_scale_pk16_bf16_fp6 : AMDGPUCvtScaleIntrinsic<llvm_v16bf16_ty, llvm_v3i32_ty, "cvt_scale_pk16_bf16_fp6">;
def int_amdgcn_cvt_scale_pk16_f32_fp6 : AMDGPUCvtScaleIntrinsic<llvm_v16f32_ty, llvm_v3i32_ty, "cvt_scale_pk16_f32_fp6">;
def int_amdgcn_cvt_scale_pk16_f32_bf6 : AMDGPUCvtScaleIntrinsic<llvm_v16f32_ty, llvm_v3i32_ty, "cvt_scale_pk16_f32_bf6">;
class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : DefaultAttrsIntrinsic< class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : DefaultAttrsIntrinsic<
[DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] [DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]

View File

@ -4603,6 +4603,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8: case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8:
case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8: case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8:
case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4: case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4:
case Intrinsic::amdgcn_cvt_scale_pk16_f16_fp6:
case Intrinsic::amdgcn_cvt_scale_pk16_bf16_fp6:
case Intrinsic::amdgcn_cvt_scale_pk16_f16_bf6:
case Intrinsic::amdgcn_cvt_scale_pk16_bf16_bf6:
case Intrinsic::amdgcn_cvt_scale_pk16_f32_fp6:
case Intrinsic::amdgcn_cvt_scale_pk16_f32_bf6:
case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16: case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16:
case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16: case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16:
case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16: case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16:

View File

@ -2950,6 +2950,8 @@ def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>; def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>;
def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>; def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>;
def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>; def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>;
// Profiles used by the pk16 scale conversions: a 16-element f16/bf16 result
// with v3i32 and i32 inputs and an untyped (unused) last slot.
// NOTE(review): assumes the list order is [dst, src0, src1, src2], as the
// neighbouring profile names suggest -- confirm against VOPProfile.
def VOP_V16F16_V3I32_I32 : VOPProfile<[v16f16, v3i32, i32, untyped]>;
def VOP_V16BF16_V3I32_I32 : VOPProfile<[v16bf16, v3i32, i32, untyped]>;
def VOP_V8F16_V2I32_I32 : VOPProfile<[v8f16, v2i32, i32, untyped]>; def VOP_V8F16_V2I32_I32 : VOPProfile<[v8f16, v2i32, i32, untyped]>;
def VOP_V8BF16_V2I32_I32 : VOPProfile<[v8bf16, v2i32, i32, untyped]>; def VOP_V8BF16_V2I32_I32 : VOPProfile<[v8bf16, v2i32, i32, untyped]>;
def VOP_V8F16_I32_I32 : VOPProfile<[v8f16, i32, i32, untyped]>; def VOP_V8F16_I32_I32 : VOPProfile<[v8f16, i32, i32, untyped]>;

View File

@ -1777,6 +1777,12 @@ let SubtargetPredicate = isGFX1250Plus in {
defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>; defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>;
defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>; defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>;
defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>; defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>;
// FP6/BF6 -> 16 x f16/bf16/f32 scale conversions. Each defm hands the matching
// int_amdgcn_cvt_scale_pk16_* intrinsic to VOP3CvtScaleSelInst (presumably as
// the selection pattern -- the multiclass body is not visible here). These
// defms sit before the closing brace annotated "@earlyclobber $vdst".
defm V_CVT_SCALE_PK16_F16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_fp6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_fp6>;
defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_fp6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_fp6>;
defm V_CVT_SCALE_PK16_F16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_bf6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_bf6>;
defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_bf6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_bf6>;
defm V_CVT_SCALE_PK16_F32_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_fp6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_fp6>;
defm V_CVT_SCALE_PK16_F32_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_bf6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_bf6>;
} // End Constraints = "@earlyclobber $vdst" } // End Constraints = "@earlyclobber $vdst"
defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>; defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>;
@ -2248,6 +2254,12 @@ defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x2c3>;
defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2c4>; defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2c4>;
defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x2c5>; defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x2c5>;
defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c6>; defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c6>;
// gfx1250 real encodings for v_cvt_scale_pk16_*: opcodes 0x2c7-0x2cc, listed in
// opcode order (the three fp6-source variants first, then the three bf6 ones).
defm V_CVT_SCALE_PK16_F16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c7>;
defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c8>;
defm V_CVT_SCALE_PK16_F32_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c9>;
defm V_CVT_SCALE_PK16_F16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2ca>;
defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cb>;
defm V_CVT_SCALE_PK16_F32_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cc>;
defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>; defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>;
defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>; defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>;
defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>; defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>;

View File

@ -11,6 +11,12 @@ declare <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 %src, i32 %scale, i
declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> %src, i32 %scale, i32 %scale_sel) declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> %src, i32 %scale, i32 %scale_sel)
declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> %src, i32 %scale, i32 %scale_sel) declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> %src, i32 %scale, i32 %scale_sel)
declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 %src, i32 %scale, i32 %scale_sel) declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 %src, i32 %scale, i32 %scale_sel)
; pk16 FP6/BF6 scale-conversion intrinsics under test. Each takes
; (<3 x i32> src, i32 scale, i32 scale_sel) and returns a 16-element
; half, bfloat or float vector.
declare <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.fp6(<3 x i32> %src, i32 %scale, i32 %scale_sel)
declare <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.fp6(<3 x i32> %src, i32 %scale, i32 %scale_sel)
declare <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.bf6(<3 x i32> %src, i32 %scale, i32 %scale_sel)
declare <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.bf6(<3 x i32> %src, i32 %scale, i32 %scale_sel)
declare <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.fp6(<3 x i32> %src, i32 %scale, i32 %scale_sel)
declare <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.bf6(<3 x i32> %src, i32 %scale, i32 %scale_sel)
define amdgpu_ps void @test_cvt_scale_pk8_f16_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { define amdgpu_ps void @test_cvt_scale_pk8_f16_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f16_fp8_vv: ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f16_fp8_vv:
@ -162,3 +168,207 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_fp4_vv(i32 %src, i32 %scale, ptr a
store <8 x float> %cvt, ptr addrspace(1) %out, align 32 store <8 x float> %cvt, ptr addrspace(1) %out, align 32
ret void ret void
} }
; fp6 -> 16 x f16 with the source and the scale both in VGPRs and scale_sel 0.
; The expected instruction carries no scale_sel modifier. The two selectors
; differ only in the order of the two 128-bit stores of the result.
define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_fp6_vv:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f16_fp6 v[6:13], v[0:2], v3
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_fp6_vv:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f16_fp6 v[6:13], v[0:2], v3
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-GISEL-NEXT: s_endpgm
%cvt = tail call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.fp6(<3 x i32> %src, i32 %scale, i32 0)
store <16 x half> %cvt, ptr addrspace(1) %out, align 8
ret void
}
; fp6 -> 16 x f16 with an inreg (SGPR) source, a constant scale of 100 and
; scale_sel 1. The expected code first copies s0-s2 into v[10:12], then encodes
; the scale as the literal 0x64. The selectors differ in the order of the
; copies and of the two result stores.
define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) {
; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_fp6_sl:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v12, s2
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f16_fp6 v[2:9], v[10:12], 0x64 scale_sel:1
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_fp6_sl:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v10, s0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f16_fp6 v[2:9], v[10:12], 0x64 scale_sel:1
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX1250-GISEL-NEXT: s_endpgm
%cvt = tail call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.fp6(<3 x i32> %src, i32 100, i32 1)
store <16 x half> %cvt, ptr addrspace(1) %out, align 8
ret void
}
; fp6 -> 16 x bf16 with the source and the scale both in VGPRs and scale_sel 2.
; A single set of checks under the common prefix covers this function.
define amdgpu_ps void @test_cvt_scale_pk16_bf16_fp6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_cvt_scale_pk16_bf16_fp6_vv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_cvt_scale_pk16_bf16_fp6 v[6:13], v[0:2], v3 scale_sel:2
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-NEXT: s_endpgm
%cvt = tail call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.fp6(<3 x i32> %src, i32 %scale, i32 2)
store <16 x bfloat> %cvt, ptr addrspace(1) %out, align 8
ret void
}
; fp6 -> 16 x bf16 with an inreg (SGPR) source, a constant scale of 100 and
; scale_sel 3. The expected code copies s0-s2 into v[10:12] and encodes the
; scale as the literal 0x64. A single set of checks under the common prefix
; covers this function.
define amdgpu_ps void @test_cvt_scale_pk16_bf16_fp6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_cvt_scale_pk16_bf16_fp6_sl:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1
; GFX1250-NEXT: v_mov_b32_e32 v12, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_scale_pk16_bf16_fp6 v[2:9], v[10:12], 0x64 scale_sel:3
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX1250-NEXT: s_endpgm
%cvt = tail call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.fp6(<3 x i32> %src, i32 100, i32 3)
store <16 x bfloat> %cvt, ptr addrspace(1) %out, align 8
ret void
}
; bf6 -> 16 x f16 with the source and the scale both in VGPRs and scale_sel 4.
; The two selectors differ only in the order of the two 128-bit result stores.
define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_bf6_vv:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f16_bf6 v[6:13], v[0:2], v3 scale_sel:4
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_bf6_vv:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f16_bf6 v[6:13], v[0:2], v3 scale_sel:4
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-GISEL-NEXT: s_endpgm
%cvt = tail call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.bf6(<3 x i32> %src, i32 %scale, i32 4)
store <16 x half> %cvt, ptr addrspace(1) %out, align 8
ret void
}
; bf6 -> 16 x f16 with an inreg (SGPR) source, a constant scale of 100 and
; scale_sel 5. The expected code copies s0-s2 into v[10:12] and encodes the
; scale as the literal 0x64. The selectors differ in the order of the copies
; and of the two result stores.
define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) {
; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_bf6_sl:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v12, s2
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f16_bf6 v[2:9], v[10:12], 0x64 scale_sel:5
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_bf6_sl:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v10, s0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f16_bf6 v[2:9], v[10:12], 0x64 scale_sel:5
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX1250-GISEL-NEXT: s_endpgm
%cvt = tail call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.bf6(<3 x i32> %src, i32 100, i32 5)
store <16 x half> %cvt, ptr addrspace(1) %out, align 8
ret void
}
; bf6 -> 16 x bf16 with the source and the scale both in VGPRs and scale_sel 6.
; A single set of checks under the common prefix covers this function.
define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_cvt_scale_pk16_bf16_bf6_vv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_cvt_scale_pk16_bf16_bf6 v[6:13], v[0:2], v3 scale_sel:6
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-NEXT: s_endpgm
%cvt = tail call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.bf6(<3 x i32> %src, i32 %scale, i32 6)
store <16 x bfloat> %cvt, ptr addrspace(1) %out, align 8
ret void
}
; bf6 -> 16 x bf16 with an inreg (SGPR) source, a constant scale of 100 and
; scale_sel 7, the largest value the front-end range check [0, 7] accepts.
; The expected code copies s0-s2 into v[10:12] and encodes the scale as the
; literal 0x64.
define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_cvt_scale_pk16_bf16_bf6_sl:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1
; GFX1250-NEXT: v_mov_b32_e32 v12, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_scale_pk16_bf16_bf6 v[2:9], v[10:12], 0x64 scale_sel:7
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX1250-NEXT: s_endpgm
%cvt = tail call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.bf6(<3 x i32> %src, i32 100, i32 7)
store <16 x bfloat> %cvt, ptr addrspace(1) %out, align 8
ret void
}
; fp6 -> 16 x f32 with the source and the scale both in VGPRs and scale_sel 5.
; The result occupies sixteen registers, v[6:21], and is written back with four
; 128-bit stores. The two selectors emit those stores in opposite order.
define amdgpu_ps void @test_cvt_scale_pk16_f32_fp6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f32_fp6_vv:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f32_fp6 v[6:21], v[0:2], v3 scale_sel:5
; GFX1250-SDAG-NEXT: s_clause 0x3
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f32_fp6_vv:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f32_fp6 v[6:21], v[0:2], v3 scale_sel:5
; GFX1250-GISEL-NEXT: s_clause 0x3
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48
; GFX1250-GISEL-NEXT: s_endpgm
%cvt = tail call <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.fp6(<3 x i32> %src, i32 %scale, i32 5)
store <16 x float> %cvt, ptr addrspace(1) %out, align 16
ret void
}
; bf6 -> 16 x f32 with the source and the scale both in VGPRs and scale_sel 6.
; The result occupies sixteen registers, v[6:21], and is written back with four
; 128-bit stores. The two selectors emit those stores in opposite order.
define amdgpu_ps void @test_cvt_scale_pk16_f32_bf6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f32_bf6_vv:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f32_bf6 v[6:21], v[0:2], v3 scale_sel:6
; GFX1250-SDAG-NEXT: s_clause 0x3
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f32_bf6_vv:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f32_bf6 v[6:21], v[0:2], v3 scale_sel:6
; GFX1250-GISEL-NEXT: s_clause 0x3
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48
; GFX1250-GISEL-NEXT: s_endpgm
%cvt = tail call <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.bf6(<3 x i32> %src, i32 %scale, i32 6)
store <16 x float> %cvt, ptr addrspace(1) %out, align 16
ret void
}

View File

@ -1039,3 +1039,57 @@ v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 100
v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4
// GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x09,0x12,0x02] // GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x09,0x12,0x02]
// Encoding tests for the f16/bf16 v_cvt_scale_pk16_* instructions. Each opcode
// is assembled three ways: with a VGPR scale, with the 32-bit literal scale
// 0xcf00 (four extra bytes follow the 8-byte encoding), and with a non-zero
// scale_sel. In the expected bytes scale_sel shows up in the second byte
// (0x08, 0x10, 0x18, 0x20 for values 1, 2, 3, 4).
v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], v8
// GFX1250: v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], v8 ; encoding: [0x0a,0x00,0xcb,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], 0xcf00
// GFX1250: v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xcb,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], v8 scale_sel:1
// GFX1250: v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], v8 scale_sel:1 ; encoding: [0x0a,0x08,0xcb,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], v8
// GFX1250: v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], v8 ; encoding: [0x0a,0x00,0xca,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], 0xcf00
// GFX1250: v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xca,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], v8 scale_sel:2
// GFX1250: v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], v8 scale_sel:2 ; encoding: [0x0a,0x10,0xca,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], v8
// GFX1250: v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], v8 ; encoding: [0x0a,0x00,0xc8,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], 0xcf00
// GFX1250: v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xc8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], v8 scale_sel:3
// GFX1250: v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], v8 scale_sel:3 ; encoding: [0x0a,0x18,0xc8,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], v8
// GFX1250: v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], v8 ; encoding: [0x0a,0x00,0xc7,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], 0xcf00
// GFX1250: v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xc7,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], v8 scale_sel:4
// GFX1250: v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], v8 scale_sel:4 ; encoding: [0x0a,0x20,0xc7,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], v8
// GFX1250: v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], v8 ; encoding: [0x0a,0x00,0xc9,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], 0xcf00
// GFX1250: v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xc9,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], v8 scale_sel:4
// GFX1250: v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], v8 scale_sel:4 ; encoding: [0x0a,0x20,0xc9,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8
// GFX1250: v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8 ; encoding: [0x0a,0x00,0xcc,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], 0xcf00
// GFX1250: v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xcc,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8 scale_sel:5
// GFX1250: v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xcc,0xd6,0x14,0x11,0x02,0x00]

View File

@ -1039,3 +1039,57 @@ v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 100
v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4
// GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x09,0x12,0x02]
// v_cvt_scale_pk16_{bf16,f16,f32}_{bf6,fp6} assembler round-trips.
// Every opcode is exercised in three forms, in this order -
//   1. scale operand in v8 (8-byte encoding);
//   2. scale operand written as 0xcf00, which extends the encoding to 12 bytes
//      with that value held in the trailing four bytes;
//   3. scale operand in v8 plus a scale_sel modifier, whose value appears
//      shifted left by three in the second encoding byte.
// The bf16/f16 forms write an 8-register destination (v[10:17]); the f32 forms
// write a 16-register destination (v[10:25]).
v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], v8
// GFX1250: v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], v8 ; encoding: [0x0a,0x00,0xcb,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], 0xcf00
// GFX1250: v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xcb,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], v8 scale_sel:1
// GFX1250: v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], v8 scale_sel:1 ; encoding: [0x0a,0x08,0xcb,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], v8
// GFX1250: v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], v8 ; encoding: [0x0a,0x00,0xca,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], 0xcf00
// GFX1250: v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xca,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], v8 scale_sel:2
// GFX1250: v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], v8 scale_sel:2 ; encoding: [0x0a,0x10,0xca,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], v8
// GFX1250: v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], v8 ; encoding: [0x0a,0x00,0xc8,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], 0xcf00
// GFX1250: v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xc8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], v8 scale_sel:3
// GFX1250: v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], v8 scale_sel:3 ; encoding: [0x0a,0x18,0xc8,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], v8
// GFX1250: v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], v8 ; encoding: [0x0a,0x00,0xc7,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], 0xcf00
// GFX1250: v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xc7,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], v8 scale_sel:4
// GFX1250: v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], v8 scale_sel:4 ; encoding: [0x0a,0x20,0xc7,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], v8
// GFX1250: v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], v8 ; encoding: [0x0a,0x00,0xc9,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], 0xcf00
// GFX1250: v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xc9,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], v8 scale_sel:4
// GFX1250: v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], v8 scale_sel:4 ; encoding: [0x0a,0x20,0xc9,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8
// GFX1250: v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8 ; encoding: [0x0a,0x00,0xcc,0xd6,0x14,0x11,0x02,0x00]
v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], 0xcf00
// GFX1250: v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xcc,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8 scale_sel:5
// GFX1250: v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xcc,0xd6,0x14,0x11,0x02,0x00]

View File

@ -1090,3 +1090,57 @@
0x02,0x00,0x43,0xd6,0x04,0x11,0x1a,0x04
# GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[8:9], v[6:7] ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x11,0x1a,0x04]
# v_cvt_scale_pk16_{bf16,f16,f32}_{bf6,fp6} disassembly.
# Each opcode is fed three byte sequences, in this order -
#   1. a 12-byte sequence whose trailing four bytes are printed as the 0xcf00
#      scale operand;
#   2. an 8-byte sequence printed with the scale operand in v8;
#   3. the same 8 bytes with a non-zero second byte, printed with a scale_sel
#      modifier equal to that byte shifted right by three.
# The bf16/f16 forms print an 8-register destination (v[10:17]); the f32 forms
# print a 16-register destination (v[10:25]).
0x0a,0x00,0xcb,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
# GFX1250: v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xcb,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
0x0a,0x00,0xcb,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], v8 ; encoding: [0x0a,0x00,0xcb,0xd6,0x14,0x11,0x02,0x00]
0x0a,0x08,0xcb,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk16_bf16_bf6 v[10:17], v[20:22], v8 scale_sel:1 ; encoding: [0x0a,0x08,0xcb,0xd6,0x14,0x11,0x02,0x00]
0x0a,0x00,0xc8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
# GFX1250: v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xc8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
0x0a,0x00,0xc8,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], v8 ; encoding: [0x0a,0x00,0xc8,0xd6,0x14,0x11,0x02,0x00]
0x0a,0x18,0xc8,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk16_bf16_fp6 v[10:17], v[20:22], v8 scale_sel:3 ; encoding: [0x0a,0x18,0xc8,0xd6,0x14,0x11,0x02,0x00]
0x0a,0x00,0xca,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
# GFX1250: v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xca,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
0x0a,0x00,0xca,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], v8 ; encoding: [0x0a,0x00,0xca,0xd6,0x14,0x11,0x02,0x00]
0x0a,0x10,0xca,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk16_f16_bf6 v[10:17], v[20:22], v8 scale_sel:2 ; encoding: [0x0a,0x10,0xca,0xd6,0x14,0x11,0x02,0x00]
0x0a,0x00,0xc7,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
# GFX1250: v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xc7,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
0x0a,0x00,0xc7,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], v8 ; encoding: [0x0a,0x00,0xc7,0xd6,0x14,0x11,0x02,0x00]
0x0a,0x20,0xc7,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk16_f16_fp6 v[10:17], v[20:22], v8 scale_sel:4 ; encoding: [0x0a,0x20,0xc7,0xd6,0x14,0x11,0x02,0x00]
0x0a,0x00,0xcc,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
# GFX1250: v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xcc,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
0x0a,0x00,0xcc,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8 ; encoding: [0x0a,0x00,0xcc,0xd6,0x14,0x11,0x02,0x00]
0x0a,0x28,0xcc,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xcc,0xd6,0x14,0x11,0x02,0x00]
0x0a,0x00,0xc9,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
# GFX1250: v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], 0xcf00 ; encoding: [0x0a,0x00,0xc9,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
0x0a,0x00,0xc9,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], v8 ; encoding: [0x0a,0x00,0xc9,0xd6,0x14,0x11,0x02,0x00]
0x0a,0x20,0xc9,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], v8 scale_sel:4 ; encoding: [0x0a,0x20,0xc9,0xd6,0x14,0x11,0x02,0x00]