From 7d2332391f81d44d7c9d1ca40bd5f393c59ad0df Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Wed, 30 Jul 2025 16:15:50 -0700 Subject: [PATCH] [AMDGPU] Fix destination op_sel for v_cvt_scale32_* and v_cvt_sr_* (#151411) GFX950 uses OP_SEL[MSB:LSB] for both src reads and dest writes. So this patch essentially revert the work from https://github.com/llvm/llvm-project/pull/151286 regarding dest writes. --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 4 ++-- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8 ++++--- .../llvm.amdgcn.cvt.scalef32.pk.gfx950.ll | 24 +++++++++---------- .../AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll | 24 +++++++++---------- .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll | 12 +++++----- 5 files changed, 37 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index fab99f57e76c..b0d3b12471a3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -7038,13 +7038,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0); } diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 51f2c35a751e..6875c83e04d4 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1015,8 +1015,10 @@ class SrcAndDstSelToOpSelXForm : SDNodeXFormgetTargetConstant(New, SDLoc(N), MVT::i32); }]>; @@ -1060,7 +1062,7 @@ def gi_SrcSelToOpSelXForm : GICustomOperandRenderer<"renderSrcSelToOpSelXForm">, def DstSelToOpSel3XForm : SDNodeXFormgetZExtValue(); return CurDAG->getTargetConstant( - (V & 0x1) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE, + (V & 0x2) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE, SDLoc(N), MVT::i32); }]>; def gi_DstSelToOpSel3XForm : GICustomOperandRenderer<"renderDstSelToOpSel3XFormXForm">, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll index 788a9b251812..217c306a1ff9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll @@ -813,7 +813,7 @@ define i32 @test_cvt_scale_fp4_f32_byte1(i32 %old, float %src0, float %src1, flo ; GCN-LABEL: test_cvt_scale_fp4_f32_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1) ret i32 %ret @@ -823,7 +823,7 @@ define i32 @test_cvt_scale_fp4_f32_byte2(i32 %old, float %src0, float %src1, flo ; GCN-LABEL: test_cvt_scale_fp4_f32_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2) ret i32 %ret @@ -1302,7 +1302,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte1(<2 x half> %src0, float %scale, i32 ; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,0,1] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1314,7 +1314,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte2(<2 x half> %src0, float %scale, i32 ; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1380,7 +1380,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte1(<2 x bfloat> %src0, float %scale, i ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,0,1] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1392,7 +1392,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte2(<2 x bfloat> %src0, float %scale, i ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2072,7 +2072,7 @@ define i32 @test_cvt_scale_fp4_f32_byte1_inreg_src(i32 %old, float inreg %src0, ; GCN-LABEL: test_cvt_scale_fp4_f32_byte1_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,0,1] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1) ret i32 %ret @@ -2082,7 +2082,7 @@ define i32 @test_cvt_scale_fp4_f32_byte2_inreg_src(i32 %old, float inreg %src0, ; GCN-LABEL: test_cvt_scale_fp4_f32_byte2_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,0,1] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2) ret i32 %ret @@ -2515,7 +2515,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte1_inreg_src(<2 x half> inreg %src0, fl ; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,0,1] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,0] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2527,7 +2527,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte2_inreg_src(<2 x half> inreg %src0, fl ; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2562,7 +2562,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte1_inreg_src(<2 x bfloat> inreg %src0, ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,0,1] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,0] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2574,7 +2574,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte2_inreg_src(<2 x bfloat> inreg %src0, ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll index 8f15004366dd..fec30ee18eb5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll @@ -28,7 +28,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_bf16_dst_sel_1(ptr addrspace(1) ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -42,7 +42,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_bf16_dst_sel_2(ptr addrspace(1) ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -84,7 +84,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f16_dst_sel_1(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -98,7 +98,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f16_dst_sel_2(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -140,7 +140,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f32_dst_sel_1(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -154,7 +154,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f32_dst_sel_2(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -196,7 +196,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_bf16_dst_sel_1(ptr addrspace(1) ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -210,7 +210,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_bf16_dst_sel_2(ptr addrspace(1) ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -252,7 +252,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f16_dst_sel_1(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -266,7 +266,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f16_dst_sel_2(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -308,7 +308,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f32_dst_sel_1(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -322,7 +322,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f32_dst_sel_2(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll index 80e08a915e36..ea887a252514 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll @@ -25,7 +25,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_1(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -39,7 +39,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_2(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -81,7 +81,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_1(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -95,7 +95,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_2(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -137,7 +137,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_1(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v6, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v7, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -151,7 +151,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_2(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v6, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v7, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4