[AMDGPU] Fix destination op_sel for v_cvt_scale32_* and v_cvt_sr_* (#151411)

GFX950 uses OP_SEL[MSB:LSB] for both src reads and dest writes. So this
patch essentially revert the work from
https://github.com/llvm/llvm-project/pull/151286 regarding dest writes.
This commit is contained in:
Changpeng Fang 2025-07-30 16:15:50 -07:00 committed by GitHub
parent ed940d7228
commit 7d2332391f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 37 additions and 35 deletions

View File

@ -7038,13 +7038,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(
(MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
(MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}
void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
? (int64_t)SISrcMods::DST_OP_SEL
: 0);
}

View File

@ -1015,8 +1015,10 @@ class SrcAndDstSelToOpSelXForm<int modifier_idx, bit dest_sel> : SDNodeXForm<tim
if (}] # modifier_idx # [{ == 0) {
New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL)
: ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE);
} else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) {
New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
} else if (}] # modifier_idx # [{== 1) {
New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
} if (}] # modifier_idx # [{== 2) {
New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
}
return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32);
}]>;
@ -1060,7 +1062,7 @@ def gi_SrcSelToOpSelXForm : GICustomOperandRenderer<"renderSrcSelToOpSelXForm">,
def DstSelToOpSel3XForm : SDNodeXForm<timm, [{
uint32_t V = N->getZExtValue();
return CurDAG->getTargetConstant(
(V & 0x1) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE,
(V & 0x2) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE,
SDLoc(N), MVT::i32);
}]>;
def gi_DstSelToOpSel3XForm : GICustomOperandRenderer<"renderDstSelToOpSel3XFormXForm">,

View File

@ -813,7 +813,7 @@ define i32 @test_cvt_scale_fp4_f32_byte1(i32 %old, float %src0, float %src1, flo
; GCN-LABEL: test_cvt_scale_fp4_f32_byte1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1)
ret i32 %ret
@ -823,7 +823,7 @@ define i32 @test_cvt_scale_fp4_f32_byte2(i32 %old, float %src0, float %src1, flo
; GCN-LABEL: test_cvt_scale_fp4_f32_byte2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,0]
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2)
ret i32 %ret
@ -1302,7 +1302,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte1(<2 x half> %src0, float %scale, i32
; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,0,1]
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@ -1314,7 +1314,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte2(<2 x half> %src0, float %scale, i32
; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0]
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,0,1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@ -1380,7 +1380,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte1(<2 x bfloat> %src0, float %scale, i
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,0,1]
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@ -1392,7 +1392,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte2(<2 x bfloat> %src0, float %scale, i
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0]
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,0,1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@ -2072,7 +2072,7 @@ define i32 @test_cvt_scale_fp4_f32_byte1_inreg_src(i32 %old, float inreg %src0,
; GCN-LABEL: test_cvt_scale_fp4_f32_byte1_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,0,1]
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1)
ret i32 %ret
@ -2082,7 +2082,7 @@ define i32 @test_cvt_scale_fp4_f32_byte2_inreg_src(i32 %old, float inreg %src0,
; GCN-LABEL: test_cvt_scale_fp4_f32_byte2_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,0]
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,0,1]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2)
ret i32 %ret
@ -2515,7 +2515,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte1_inreg_src(<2 x half> inreg %src0, fl
; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,0,1]
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,0]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
@ -2527,7 +2527,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte2_inreg_src(<2 x half> inreg %src0, fl
; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,0]
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,0,1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
@ -2562,7 +2562,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte1_inreg_src(<2 x bfloat> inreg %src0,
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,0,1]
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,0]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
@ -2574,7 +2574,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte2_inreg_src(<2 x bfloat> inreg %src0,
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,0]
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,0,1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]

View File

@ -28,7 +28,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_bf16_dst_sel_1(ptr addrspace(1)
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -42,7 +42,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_bf16_dst_sel_2(ptr addrspace(1)
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -84,7 +84,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f16_dst_sel_1(ptr addrspace(1) %
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -98,7 +98,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f16_dst_sel_2(ptr addrspace(1) %
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -140,7 +140,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f32_dst_sel_1(ptr addrspace(1) %
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -154,7 +154,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f32_dst_sel_2(ptr addrspace(1) %
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -196,7 +196,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_bf16_dst_sel_1(ptr addrspace(1)
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -210,7 +210,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_bf16_dst_sel_2(ptr addrspace(1)
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -252,7 +252,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f16_dst_sel_1(ptr addrspace(1) %
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -266,7 +266,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f16_dst_sel_2(ptr addrspace(1) %
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -308,7 +308,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f32_dst_sel_1(ptr addrspace(1) %
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -322,7 +322,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f32_dst_sel_2(ptr addrspace(1) %
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4

View File

@ -25,7 +25,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_1(ptr addrspace(1) %o
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v6, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -39,7 +39,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_2(ptr addrspace(1) %o
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v6, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -81,7 +81,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_1(ptr addrspace(1) %
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v6, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -95,7 +95,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_2(ptr addrspace(1) %
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v6, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -137,7 +137,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_1(ptr addrspace(1) %o
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v6, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,0,1]
; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v7, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4
@ -151,7 +151,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_2(ptr addrspace(1) %o
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v6, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,1,0]
; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v7, off
; GFX950-NEXT: s_endpgm
%old = load i32, ptr addrspace(1) %out, align 4