From e71da01f0f908417723a54cf8829a734a37fa173 Mon Sep 17 00:00:00 2001 From: Guo Chen Date: Tue, 24 Mar 2026 15:03:00 -0400 Subject: [PATCH] [AMDGPU][True16] add true16 pattern for cvt_pk_fp32_f8 (#180096) --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 21 +- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll | 414 ++++++++++++------ 2 files changed, 299 insertions(+), 136 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 1c1282c0791f..d40378a5ac4b 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -821,12 +821,25 @@ class Cvt_PK_F32_F8_Pat_OpSel; +class Cvt_PK_F32_F8_Pat_t16 : GCNPat< + (v2f32 (node i32:$src, index)), + (inst_e64 0, (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS $src, VGPR_32)), + !if(index, hi16, lo16)), 0) +>; + let SubtargetPredicate = isGFX11Plus, OtherPredicates = [HasFP8ConversionInsts] in { foreach Index = [0, -1] in { - def : Cvt_PK_F32_F8_Pat_OpSel; - def : Cvt_PK_F32_F8_Pat_OpSel; + let True16Predicate = UseFakeTrue16Insts in { + def : Cvt_PK_F32_F8_Pat_OpSel; + def : Cvt_PK_F32_F8_Pat_OpSel; + } + let True16Predicate = UseRealTrue16Insts in { + def : Cvt_PK_F32_F8_Pat_t16; + def : Cvt_PK_F32_F8_Pat_t16; + } } } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll index d847e8afcb0e..a267d2a515ae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll @@ -305,28 +305,51 @@ define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) { ; GFX9X-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 ; GFX9X-NEXT: s_setpc_b64 s[30:31] ; -; GFX1170-LABEL: test_cvt_pk_f32_bf8_word0: -; GFX1170: ; %bb.0: -; GFX1170-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1170-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 -; GFX1170-NEXT: s_setpc_b64 s[30:31] +; GFX1170-TRUE16-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX1170-TRUE16: ; %bb.0: +; GFX1170-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1170-TRUE16-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0.l +; GFX1170-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: test_cvt_pk_f32_bf8_word0: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX1170-FAKE16-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX1170-FAKE16: ; %bb.0: +; GFX1170-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1170-FAKE16-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX1170-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: test_cvt_pk_f32_bf8_word0: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX12-TRUE16-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) ret <2 x float> %ret } @@ -338,28 +361,51 @@ define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) { ; GFX9X-NEXT: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 ; GFX9X-NEXT: s_setpc_b64 s[30:31] ; -; GFX1170-LABEL: test_cvt_pk_f32_bf8_word1: -; GFX1170: ; %bb.0: -; GFX1170-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1170-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] -; GFX1170-NEXT: s_setpc_b64 s[30:31] +; GFX1170-TRUE16-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX1170-TRUE16: ; %bb.0: +; GFX1170-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1170-TRUE16-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0.h +; GFX1170-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: test_cvt_pk_f32_bf8_word1: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX1170-FAKE16-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX1170-FAKE16: ; %bb.0: +; GFX1170-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1170-FAKE16-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX1170-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: test_cvt_pk_f32_bf8_word1: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX12-TRUE16-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0.h +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0.h +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true) ret <2 x float> %ret } @@ -371,28 +417,51 @@ define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) { ; GFX9X-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 ; GFX9X-NEXT: s_setpc_b64 s[30:31] ; -; GFX1170-LABEL: test_cvt_pk_f32_fp8_word0: -; GFX1170: ; %bb.0: -; GFX1170-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1170-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 -; GFX1170-NEXT: s_setpc_b64 s[30:31] +; GFX1170-TRUE16-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX1170-TRUE16: ; %bb.0: +; GFX1170-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1170-TRUE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0.l +; GFX1170-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: test_cvt_pk_f32_fp8_word0: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX1170-FAKE16-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX1170-FAKE16: ; %bb.0: +; GFX1170-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1170-FAKE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX1170-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: test_cvt_pk_f32_fp8_word0: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX12-TRUE16-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false) ret <2 x float> %ret } @@ -404,28 +473,51 @@ define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) { ; GFX9X-NEXT: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 ; GFX9X-NEXT: s_setpc_b64 s[30:31] ; -; GFX1170-LABEL: test_cvt_pk_f32_fp8_word1: -; GFX1170: ; %bb.0: -; GFX1170-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1170-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] -; GFX1170-NEXT: s_setpc_b64 s[30:31] +; GFX1170-TRUE16-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX1170-TRUE16: ; %bb.0: +; GFX1170-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1170-TRUE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0.h +; GFX1170-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: test_cvt_pk_f32_fp8_word1: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX1170-FAKE16-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX1170-FAKE16: ; %bb.0: +; GFX1170-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1170-FAKE16-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] +; GFX1170-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: test_cvt_pk_f32_fp8_word1: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX12-TRUE16-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0.h +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0.h +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) ret <2 x float> %ret } @@ -1124,34 +1216,63 @@ define <2 x float> @test_sext_cvt_pk_f32_bf8_word1(i16 %a) { ; GFX9X-NEXT: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 ; GFX9X-NEXT: s_setpc_b64 s[30:31] ; -; GFX1170-LABEL: test_sext_cvt_pk_f32_bf8_word1: -; GFX1170: ; %bb.0: -; GFX1170-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1170-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] -; GFX1170-NEXT: s_setpc_b64 s[30:31] +; GFX1170-TRUE16-LABEL: test_sext_cvt_pk_f32_bf8_word1: +; GFX1170-TRUE16: ; %bb.0: +; GFX1170-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1170-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1170-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-TRUE16-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0.h +; GFX1170-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: test_sext_cvt_pk_f32_bf8_word1: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX1170-FAKE16-LABEL: test_sext_cvt_pk_f32_bf8_word1: +; GFX1170-FAKE16: ; %bb.0: +; GFX1170-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1170-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1170-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-FAKE16-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX1170-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: test_sext_cvt_pk_f32_bf8_word1: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX12-TRUE16-LABEL: test_sext_cvt_pk_f32_bf8_word1: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0.h +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: test_sext_cvt_pk_f32_bf8_word1: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: test_sext_cvt_pk_f32_bf8_word1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0.h +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: test_sext_cvt_pk_f32_bf8_word1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] %a.sext = sext i16 %a to i32 %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a.sext, i1 true) ret <2 x float> %ret @@ -1165,34 +1286,63 @@ define <2 x float> @test_sext_cvt_pk_f32_fp8_word0(i16 %a) { ; GFX9X-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 ; GFX9X-NEXT: s_setpc_b64 s[30:31] ; -; GFX1170-LABEL: test_sext_cvt_pk_f32_fp8_word0: -; GFX1170: ; %bb.0: -; GFX1170-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1170-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 -; GFX1170-NEXT: s_setpc_b64 s[30:31] +; GFX1170-TRUE16-LABEL: test_sext_cvt_pk_f32_fp8_word0: +; GFX1170-TRUE16: ; %bb.0: +; GFX1170-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1170-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1170-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-TRUE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0.l +; GFX1170-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: test_sext_cvt_pk_f32_fp8_word0: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX1170-FAKE16-LABEL: test_sext_cvt_pk_f32_fp8_word0: +; GFX1170-FAKE16: ; %bb.0: +; GFX1170-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1170-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1170-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1170-FAKE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX1170-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: test_sext_cvt_pk_f32_fp8_word0: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX12-TRUE16-LABEL: test_sext_cvt_pk_f32_fp8_word0: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: test_sext_cvt_pk_f32_fp8_word0: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: test_sext_cvt_pk_f32_fp8_word0: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: test_sext_cvt_pk_f32_fp8_word0: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] %a.sext = sext i16 %a to i32 %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a.sext, i1 false) ret <2 x float> %ret