From e8e816344e40353900c0f7f64ba1458184fdfad1 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 21 Aug 2025 12:24:40 +0900 Subject: [PATCH] AMDGPU: Allow folding multiple uses of some immediates into copies In some cases this will require an avoidable re-defining of a register, but it works out better most of the time. Also allow folding 64-bit immediates into subregister extracts, unless it would break an inline constant. We could be more aggressive here, but this set of conditions seems to do a reasonable job without introducing too many regressions. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 27 +- .../GlobalISel/llvm.amdgcn.interp.inreg.ll | 12 +- .../CodeGen/AMDGPU/GlobalISel/mubuf-global.ll | 20 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 26 +- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 52 +- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 17 +- llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll | 5 +- .../AMDGPU/agpr-copy-no-free-registers.ll | 50 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 22 +- .../atomic_optimizations_local_pointer.ll | 14 +- .../branch-folding-implicit-def-subreg.ll | 356 +++--- ...dagcomb-extract-vec-elt-different-sizes.ll | 36 +- .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll | 112 +- llvm/test/CodeGen/AMDGPU/div_i128.ll | 112 +- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 1112 ++++++++--------- .../divergent-branch-uniform-condition.ll | 28 +- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 2 +- .../CodeGen/AMDGPU/extract_vector_elt-i16.ll | 8 +- llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll | 49 +- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 390 +++--- llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 80 +- .../identical-subrange-spill-infloop.ll | 11 +- llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll | 2 +- ...llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll | 2 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 8 +- llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | 95 +- llvm/test/CodeGen/AMDGPU/mad-combine.ll | 18 +- .../CodeGen/AMDGPU/masked-load-vectortypes.ll | 2 +- llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 2 +- .../AMDGPU/peephole-fold-imm-multi-use.mir | 94 ++ llvm/test/CodeGen/AMDGPU/rem_i128.ll | 232 ++-- llvm/test/CodeGen/AMDGPU/roundeven.ll | 12 +- llvm/test/CodeGen/AMDGPU/rsq.f64.ll | 352 +++--- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 2 +- .../test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll | 6 +- llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 42 +- llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 232 ++-- llvm/test/CodeGen/AMDGPU/srem64.ll | 2 +- llvm/test/CodeGen/AMDGPU/srl.ll | 2 +- .../CodeGen/AMDGPU/subreg-coalescer-crash.ll | 2 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 2 +- llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 42 +- .../AMDGPU/undef-handling-crash-in-ra.ll | 40 +- llvm/test/CodeGen/AMDGPU/urem64.ll | 96 +- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 2 +- llvm/test/CodeGen/AMDGPU/valu-i1.ll | 2 +- 46 files changed, 1997 insertions(+), 1835 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/peephole-fold-imm-multi-use.mir diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6f07e3df2d1a..73b7a994ba54 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3559,13 +3559,12 @@ static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) { bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const { - if (!MRI->hasOneNonDBGUse(Reg)) - return false; - int64_t Imm; if (!getConstValDefinedInReg(DefMI, Reg, Imm)) return false; + const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg); + assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form"); unsigned Opc = UseMI.getOpcode(); @@ -3577,6 +3576,25 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg); + if (HasMultipleUses) { + // TODO: This should fold in more cases with multiple use, but we need to + // more carefully consider what those uses are. + unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg)); + + // Avoid breaking up a 64-bit inline immediate into a subregister extract. + if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64) + return false; + + // Most of the time folding a 32-bit inline constant is free (though this + // might not be true if we can't later fold it into a real user). + // + // FIXME: This isInlineConstant check is imprecise if + // getConstValDefinedInReg handled the tricky non-mov cases. + if (ImmDefSize == 32 && + !isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32)) + return false; + } + bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister && RI.getSubRegIdxSize(UseSubReg) == 16; @@ -3664,6 +3682,9 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, return true; } + if (HasMultipleUses) + return false; + if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll index a09703285087..bd6634f25077 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll @@ -358,12 +358,12 @@ main_body: define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 { ; GFX11-TRUE16-LABEL: v_interp_f16_imm_params: ; GFX11-TRUE16: ; %bb.0: ; %main_body -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7 -; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7 +; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v2, v3 wait_exp:7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l @@ -383,12 +383,12 @@ define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) # ; ; GFX12-TRUE16-LABEL: v_interp_f16_imm_params: ; GFX12-TRUE16: ; %bb.0: ; %main_body -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7 -; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7 +; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v2, v3 wait_exp:7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll index 07d5ff2036d9..b75eb737534e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -1379,45 +1379,43 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in ; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX6-NEXT: v_lshl_b64 v[1:2], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, 2 +; GFX6-NEXT: v_mov_b32_e32 v0, 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-NEXT: v_mov_b32_e32 v0, 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: v_mov_b32_e32 v4, 2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo -; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_mov_b32_e32 v2, 2 +; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 832f066adaa8..2f956d7a0a53 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -229,21 +229,23 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 +; GFX6-NEXT: v_min_i32_e32 v6, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v7, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v7, v6 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 -; GFX6-NEXT: v_max_i32_e32 v1, v5, v1 +; GFX6-NEXT: v_max_i32_e32 v1, v6, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v5, -2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -2951,20 +2953,22 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: saddsat_v2i16_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 +; GFX6-NEXT: v_min_i32_e32 v4, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 +; GFX6-NEXT: v_max_i32_e32 v4, s0, v4 +; GFX6-NEXT: v_min_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 -; GFX6-NEXT: v_min_i32_e32 v3, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_min_i32_e32 v3, 0, v1 +; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 8d8eca162257..19dc20c51004 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -1067,24 +1067,24 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] -; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, vcc, v9, v1, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, 0x1000, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc @@ -1660,24 +1660,24 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] -; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, vcc, v9, v1, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, 0x12d8fb, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 2673ac4fb5ba..c1b225562b77 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -233,16 +233,17 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 -; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GFX6-NEXT: v_min_i32_e32 v6, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v7, 1 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 -; GFX6-NEXT: v_min_i32_e32 v1, v1, v5 +; GFX6-NEXT: v_min_i32_e32 v1, v1, v6 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x80000001 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 @@ -1260,7 +1261,8 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 @@ -1279,7 +1281,8 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000001, v4 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6 ; GFX8-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll index 4b6375cc6080..153898560fc3 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll @@ -74,12 +74,13 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa ; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 20, v2 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo -; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v3 scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: s_endpgm %stof = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %ptr) diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 3160e38df5e3..c226dae3d64a 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -521,14 +521,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX908-NEXT: s_sub_i32 s1, 0, s7 -; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s0 -; GFX908-NEXT: v_mov_b32_e32 v19, 0 -; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, 0 -; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX908-NEXT: v_readfirstlane_b32 s2, v2 +; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s0 +; GFX908-NEXT: v_mov_b32_e32 v17, 0 +; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX908-NEXT: v_readfirstlane_b32 s2, v0 ; GFX908-NEXT: s_mul_i32 s1, s1, s2 ; GFX908-NEXT: s_mul_hi_u32 s1, s2, s1 ; GFX908-NEXT: s_add_i32 s2, s2, s1 @@ -544,12 +542,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_cmp_ge_u32 s2, s7 ; GFX908-NEXT: s_cselect_b32 s8, s3, s1 ; GFX908-NEXT: s_lshr_b32 s2, s0, 16 -; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s2 +; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s2 ; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 ; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1] ; GFX908-NEXT: s_or_b32 s14, s14, 28 ; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s2, v16 ; GFX908-NEXT: s_and_b32 s2, 0xffff, s2 @@ -613,15 +613,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: s_add_u32 s22, s20, s9 ; GFX908-NEXT: s_addc_u32 s23, s21, s13 -; GFX908-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc +; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc +; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:-8 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v19, s[22:23] offset:-4 glc +; GFX908-NEXT: global_load_dword v12, v17, s[22:23] offset:-4 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v19, s[22:23] glc +; GFX908-NEXT: global_load_dword v12, v17, s[22:23] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: ds_read_b64 v[12:13], v19 +; GFX908-NEXT: ds_read_b64 v[12:13], v17 ; GFX908-NEXT: ds_read_b64 v[14:15], v0 ; GFX908-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) @@ -632,8 +632,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX908-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX908-NEXT: v_add_f32_e32 v24, v17, v12 -; GFX908-NEXT: v_add_f32_e32 v25, v18, v13 +; GFX908-NEXT: v_add_f32_e32 v24, v18, v12 +; GFX908-NEXT: v_add_f32_e32 v25, v19, v13 ; GFX908-NEXT: v_add_f32_e32 v26, 0, v12 ; GFX908-NEXT: v_add_f32_e32 v27, 0, v13 ; GFX908-NEXT: v_add_f32_e32 v15, v22, v15 @@ -688,12 +688,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX90A-NEXT: s_sub_i32 s1, 0, s7 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX90A-NEXT: v_readfirstlane_b32 s2, v3 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: s_mul_i32 s1, s1, s2 ; GFX90A-NEXT: s_mul_hi_u32 s1, s2, s1 ; GFX90A-NEXT: s_add_i32 s2, s2, s1 @@ -709,7 +709,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 ; GFX90A-NEXT: s_cselect_b32 s8, s3, s1 ; GFX90A-NEXT: s_lshr_b32 s2, s0, 16 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s2 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 ; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 ; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1] @@ -736,7 +736,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_cbranch_vccz .LBB3_10 ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 ; GFX90A-NEXT: s_mov_b32 s13, s12 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3] @@ -794,7 +794,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v21 ; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[2:3], v[14:15] +; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[0:1], v[14:15] ; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[14:15], 0 op_sel_hi:[1,0] ; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index c7385e4324e2..b2dcd7727498 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -10032,20 +10032,20 @@ define i64 @udiv_i64_gt_smax(i8 %size) { ; GFX9-LABEL: udiv_i64_gt_smax: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 31 -; GFX9-NEXT: v_not_b32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: v_not_b32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; GFX9-NEXT: s_mov_b32 s4, 0xcccccccd -; GFX9-NEXT: v_ashrrev_i32_sdwa v1, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, s4 -; GFX9-NEXT: v_not_b32_e32 v5, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mul_hi_u32 v1, v5, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, 31 +; GFX9-NEXT: v_ashrrev_i32_sdwa v0, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_not_b32_e32 v6, v0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, s4, v[1:2] ; GFX9-NEXT: s_mov_b32 s6, 0xcccccccc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, s4, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[1:2] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v1 ; GFX9-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, s6, v[0:1] ; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 17737cccec7c..23c5f4f5506f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -14614,8 +14614,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -14645,8 +14645,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -14677,8 +14677,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -14711,8 +14711,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc @@ -14742,8 +14742,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo @@ -14774,8 +14774,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc @@ -14805,8 +14805,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo +; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 006fe51a32c7..12f8a59f0b84 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -25,15 +25,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 8, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr5 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4 = DS_READ_B32_gfx9 renamable $vgpr5, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr28_sgpr29, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.2, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -41,10 +41,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46, $sgpr47, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46, $sgpr47, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr4, $vgpr5 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr3 = IMPLICIT_DEF implicit-def $vgpr2 ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $vgpr23 = IMPLICIT_DEF implicit-def $vgpr22 ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF implicit-def $vgpr24 @@ -52,22 +52,21 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc - ; GFX90A-NEXT: renamable $vgpr15 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr17 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec - ; GFX90A-NEXT: renamable $vgpr4 = COPY renamable $sgpr25, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr4_vgpr5, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = COPY renamable $sgpr25, implicit $exec ; GFX90A-NEXT: renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr24, $vgpr0, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr4, killed $vgpr1, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr2, killed $vgpr1, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 2, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr40, renamable $vcc = V_ADD_CO_U32_e64 $vgpr46, killed $vgpr0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr41, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr47, killed $vcc, 0, implicit $exec @@ -76,7 +75,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 @@ -89,9 +88,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF @@ -99,31 +98,30 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.Flow20: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr19 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr21 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr20 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr23 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr22 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr25 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr24 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr25 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr21 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr23 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec @@ -131,7 +129,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.8.Flow32: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr18_sgpr19, implicit-def $exec, implicit-def $scc, implicit $exec @@ -140,15 +138,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.9.bb89: ; GFX90A-NEXT: successors: %bb.10(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec @@ -157,15 +155,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec @@ -174,10 +172,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: @@ -359,7 +357,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i23) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec @@ -376,36 +374,36 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.36.Flow21: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr44_sgpr45, $sgpr42_sgpr43, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr44_sgpr45, $sgpr42_sgpr43, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i30) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec @@ -416,28 +414,28 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.38.Flow22: ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -458,7 +456,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr66_sgpr67 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr66_sgpr67 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i37) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec @@ -467,27 +465,27 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.40.Flow23: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -507,7 +505,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -516,31 +514,31 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr18, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.46, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc - ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec + ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr3, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc @@ -557,7 +555,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr17, 16, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc @@ -569,26 +567,26 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.44: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr57, $vgpr56, $vgpr18, $vgpr30, $vgpr31, $vgpr60, $vgpr62, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $vgpr61, $vgpr58, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr3, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr63 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr57, $vgpr56, $vgpr30, $vgpr31, $vgpr60, $vgpr62, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $vgpr61, $vgpr58, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr4, $vgpr5, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.45.Flow26: ; GFX90A-NEXT: successors: %bb.47(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc @@ -604,7 +602,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -618,25 +616,25 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.47.Flow25: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -654,21 +652,21 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.49: ; GFX90A-NEXT: successors: %bb.44(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: S_BRANCH %bb.44 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec @@ -677,108 +675,115 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.51: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc - ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr52_sgpr53 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr9, dead renamable $sgpr52_sgpr53 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: S_BRANCH %bb.61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr6 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76) - ; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr8 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76) + ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec + ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr8, implicit $exec ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.52, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.55.Flow29: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr46, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_opsel_e64 0, killed $sgpr47, 0, killed $vgpr10, 0, 1, 0, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr17, 0, $vgpr16, 0, 1, 0, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr15, 0, $vgpr14, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr12, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $sgpr21, implicit $exec + ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr12, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $sgpr22, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr12, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $sgpr46, implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_opsel_e64 0, killed $sgpr47, 0, killed $vgpr12, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr19, 0, $vgpr18, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr19 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr17, 0, $vgpr16, 0, 1, 0, 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $vgpr16, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.57: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $vgpr19 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr18 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr53 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 @@ -789,9 +794,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF @@ -801,27 +806,22 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr40_vgpr41 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr46_vgpr47 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr16 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr53 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = COPY renamable $vgpr5, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.58.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr33, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 @@ -829,35 +829,35 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec - ; GFX90A-NEXT: renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) + ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 1, $vgpr8, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = COPY renamable $vgpr9, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12 = FLAT_LOAD_UBYTE renamable $vgpr10_vgpr11, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr12, implicit $exec ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.60.Flow31: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.61.Flow30: ; GFX90A-NEXT: successors: %bb.55(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc @@ -869,7 +869,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.62.bb140: ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -877,14 +877,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.63.Flow13: ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.64.bb159: ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec @@ -893,48 +893,48 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.65.Flow10: ; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.67.bb161: ; GFX90A-NEXT: successors: %bb.65(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr13, killed $vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4 = V_OR_B32_e32 killed $vgpr52, killed $vgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr4, killed $vgpr2, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr19, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr17, implicit $exec ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) ; GFX90A-NEXT: S_BRANCH %bb.65 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr20, implicit $exec ; GFX90A-NEXT: renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr18, implicit $exec - ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr10, implicit $exec - ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 $vgpr36, $vgpr12, implicit $exec + ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr2, implicit $exec + ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr12, implicit $exec + ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 $vgpr36, $vgpr14, implicit $exec ; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc @@ -942,19 +942,19 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.69.Flow: ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.70.bb186: ; GFX90A-NEXT: successors: %bb.71(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr2, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr10, killed $vgpr3, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = V_LSHLREV_B64_e64 3, killed $vgpr4_vgpr5, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = COPY renamable $sgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr4, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr2, killed $vgpr5, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr27 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr27, implicit $exec @@ -964,31 +964,31 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr33 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, renamable $vgpr26_vgpr27, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr10, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr27, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.71.Flow9: ; GFX90A-NEXT: successors: %bb.63(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.72.bb196: ; GFX90A-NEXT: successors: %bb.69(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec - ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr50, killed $vgpr18, implicit $exec + ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr2, killed $vgpr16, implicit $exec ; GFX90A-NEXT: renamable $vgpr55 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0 diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll index fc17d9288bf4..9f12977a3efd 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -17,14 +17,14 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr ; CHECK-NEXT: ; %bb.1: ; %bb10 ; CHECK-NEXT: global_load_dwordx2 v[8:9], v0, s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v7, 0xff, v8 -; CHECK-NEXT: v_bfe_u32 v6, v8, 8, 8 -; CHECK-NEXT: v_bfe_u32 v5, v8, 16, 8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8 -; CHECK-NEXT: v_and_b32_e32 v3, 0xff, v9 -; CHECK-NEXT: v_bfe_u32 v2, v9, 8, 8 -; CHECK-NEXT: v_bfe_u32 v1, v9, 16, 8 -; CHECK-NEXT: v_lshrrev_b32_e32 v0, 24, v9 +; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v8 +; CHECK-NEXT: v_bfe_u32 v1, v8, 8, 8 +; CHECK-NEXT: v_bfe_u32 v2, v8, 16, 8 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 +; CHECK-NEXT: v_and_b32_e32 v4, 0xff, v9 +; CHECK-NEXT: v_bfe_u32 v5, v9, 8, 8 +; CHECK-NEXT: v_bfe_u32 v7, v9, 16, 8 +; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v9 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: v_mov_b32_e32 v1, 0 @@ -32,8 +32,8 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: v_mov_b32_e32 v5, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: .LBB0_3: ; %bb41 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x48 ; CHECK-NEXT: v_mov_b32_e32 v8, s14 @@ -50,16 +50,16 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr ; CHECK-NEXT: v_mov_b32_e32 v19, s25 ; CHECK-NEXT: v_mov_b32_e32 v20, s26 ; CHECK-NEXT: v_mov_b32_e32 v21, s27 -; CHECK-NEXT: flat_store_byte v[8:9], v7 -; CHECK-NEXT: flat_store_byte v[10:11], v6 -; CHECK-NEXT: flat_store_byte v[12:13], v5 -; CHECK-NEXT: flat_store_byte v[14:15], v4 -; CHECK-NEXT: flat_store_byte v[16:17], v3 -; CHECK-NEXT: flat_store_byte v[18:19], v2 -; CHECK-NEXT: flat_store_byte v[20:21], v1 +; CHECK-NEXT: flat_store_byte v[8:9], v0 +; CHECK-NEXT: flat_store_byte v[10:11], v1 +; CHECK-NEXT: flat_store_byte v[12:13], v2 +; CHECK-NEXT: flat_store_byte v[14:15], v3 +; CHECK-NEXT: flat_store_byte v[16:17], v4 +; CHECK-NEXT: flat_store_byte v[18:19], v5 +; CHECK-NEXT: flat_store_byte v[20:21], v7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] -; CHECK-NEXT: flat_store_byte v[2:3], v0 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; CHECK-NEXT: flat_store_byte v[0:1], v6 ; CHECK-NEXT: s_endpgm bb: br i1 %arg, label %bb10, label %bb41 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index f26b72027a78..59837bc718b7 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -1280,25 +1280,45 @@ define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2) } define double @fmul_select_f64_test12(double %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f64_test12: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f64_test12: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, 0, vcc +; GFX7-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: fmul_select_f64_test12: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX7-GISEL-LABEL: fmul_select_f64_test12: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f64_test12: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, 0, vcc +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f64_test12: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_select_f64_test12: ; GFX10: ; %bb.0: @@ -1325,25 +1345,45 @@ define double @fmul_select_f64_test12(double %x, i32 %bool.arg1, i32 %bool.arg2) } define double @fmul_select_f64_test13(double %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f64_test13: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, 0x40300000 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f64_test13: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0x40300000 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, 0, vcc +; GFX7-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: fmul_select_f64_test13: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0x40300000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX7-GISEL-LABEL: fmul_select_f64_test13: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, 0x40300000 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f64_test13: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x40300000 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, 0, vcc +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f64_test13: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x40300000 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_select_f64_test13: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 747affa92860..b60061589d09 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -147,8 +147,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_mov_b32_e32 v15, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -385,31 +385,31 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[10:11], s[8:9] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: s_mov_b32 s13, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s13 +; GFX9-O0-NEXT: s_mov_b32 s12, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 ; GFX9-O0-NEXT: v_min_u32_e64 v7, v7, v8 -; GFX9-O0-NEXT: s_mov_b32 s12, 0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr14 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s12 +; GFX9-O0-NEXT: s_mov_b32 s13, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr13 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s13 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 ; GFX9-O0-NEXT: v_min_u32_e64 v12, v6, v9 -; GFX9-O0-NEXT: ; implicit-def: $sgpr14 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr13 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 ; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 ; GFX9-O0-NEXT: s_mov_b32 s16, s14 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: s_mov_b32 s18, s15 +; GFX9-O0-NEXT: s_mov_b32 s13, s15 ; GFX9-O0-NEXT: v_add_co_u32_e64 v9, s[16:17], v9, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s13 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[16:17], v6, v10, s[16:17] ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 @@ -425,20 +425,20 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr16 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 ; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr13 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 @@ -543,17 +543,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 4 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-O0-NEXT: s_branch .LBB0_8 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 @@ -585,9 +585,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 @@ -645,9 +645,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_3 ; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 @@ -676,9 +676,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 @@ -870,9 +870,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 11 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill @@ -901,9 +901,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 ; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload @@ -1003,9 +1003,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill @@ -1032,9 +1032,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -1161,9 +1161,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 @@ -2427,8 +2427,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[4:5] ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v7, vcc ; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v17, 0 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2566,31 +2566,31 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: s_mov_b32 s9, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: s_mov_b32 s8, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 -; GFX9-O0-NEXT: s_mov_b32 s8, 0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 ; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_mov_b32 s12, s10 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 -; GFX9-O0-NEXT: s_mov_b32 s14, s11 +; GFX9-O0-NEXT: s_mov_b32 s9, s11 ; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[12:13], v7, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s9 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v8, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 @@ -2601,25 +2601,25 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v5, v6, s[12:13] -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 ; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 7ea98a16e3b8..5134159e3e40 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -60,25 +60,25 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8 ; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7] ; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v8, 0, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[6:7] ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v9, vcc -; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v2 +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v2, v10 +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v9, vcc +; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v8 ; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] ; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc -; SDAG-NEXT: v_or_b32_e32 v8, v8, v10 -; SDAG-NEXT: v_or_b32_e32 v9, v3, v11 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v10 +; SDAG-NEXT: v_or_b32_e32 v3, v9, v11 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v19, s[4:5] -; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v19, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v2, 1, v2 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v17, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 @@ -89,88 +89,88 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v2 -; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v2 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 -; SDAG-NEXT: v_mov_b32_e32 v9, 0 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v8 +; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v8 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v9, vcc ; SDAG-NEXT: v_lshl_b64 v[18:19], v[20:21], v18 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v2 -; SDAG-NEXT: v_or_b32_e32 v11, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], v34 -; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[20:21], v34 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_lshr_b64 v[10:11], v[20:21], v35 -; SDAG-NEXT: v_or_b32_e32 v3, v3, v11 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v10 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v23, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v22, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v9, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v11, vcc, 0x7f, v8 +; SDAG-NEXT: v_or_b32_e32 v10, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v11 +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 64, v11 +; SDAG-NEXT: v_lshl_b64 v[34:35], v[20:21], v11 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] +; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v8 +; SDAG-NEXT: v_or_b32_e32 v9, v23, v9 +; SDAG-NEXT: v_or_b32_e32 v8, v22, v8 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v11 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v19, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v35, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v34, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v11, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v30 +; SDAG-NEXT: v_lshr_b64 v[2:3], v[20:21], v30 ; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v30 ; SDAG-NEXT: v_lshl_b64 v[10:11], v[16:17], v10 -; SDAG-NEXT: v_or_b32_e32 v11, v9, v11 -; SDAG-NEXT: v_or_b32_e32 v10, v8, v10 +; SDAG-NEXT: v_or_b32_e32 v11, v3, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v2, v10 ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 -; SDAG-NEXT: v_subrev_i32_e64 v8, s[4:5], 64, v30 -; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v8 -; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; SDAG-NEXT: v_subrev_i32_e64 v2, s[4:5], 64, v30 +; SDAG-NEXT: v_lshr_b64 v[2:3], v[16:17], v2 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v21, v9, v21, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v20, v8, v20, s[4:5] -; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v30 -; SDAG-NEXT: v_cndmask_b32_e32 v23, 0, v9, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v22, 0, v8, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v21, v3, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v20, v2, v20, s[4:5] +; SDAG-NEXT: v_lshr_b64 v[2:3], v[16:17], v30 +; SDAG-NEXT: v_cndmask_b32_e32 v23, 0, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v22, 0, v2, vcc ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0 -; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v19 +; SDAG-NEXT: v_lshrrev_b32_e32 v2, 31, v19 ; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v3 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; SDAG-NEXT: v_or_b32_e32 v19, v17, v19 ; SDAG-NEXT: v_or_b32_e32 v18, v16, v18 ; SDAG-NEXT: v_or_b32_e32 v16, v22, v38 ; SDAG-NEXT: v_or_b32_e32 v17, v20, v39 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v8 -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v17 -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v21, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v16, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v23, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; SDAG-NEXT: v_and_b32_e32 v20, v8, v29 -; SDAG-NEXT: v_and_b32_e32 v22, v8, v28 -; SDAG-NEXT: v_and_b32_e32 v38, v8, v0 -; SDAG-NEXT: v_and_b32_e32 v39, v8, v1 -; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v34, v17 +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v35, v21, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v36, v16, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v37, v23, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; SDAG-NEXT: v_and_b32_e32 v20, v2, v29 +; SDAG-NEXT: v_and_b32_e32 v22, v2, v28 +; SDAG-NEXT: v_and_b32_e32 v38, v2, v0 +; SDAG-NEXT: v_and_b32_e32 v39, v2, v1 +; SDAG-NEXT: v_and_b32_e32 v2, 1, v2 ; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v17, v20 ; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v21, v22, vcc ; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v16, v38, vcc @@ -182,243 +182,243 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v16, v30, v32 ; SDAG-NEXT: v_or_b32_e32 v17, v31, v33 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_or_b32_e32 v3, v11, v3 +; SDAG-NEXT: v_or_b32_e32 v9, v11, v9 ; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v2, v10, v2 -; SDAG-NEXT: v_mov_b32_e32 v17, v9 -; SDAG-NEXT: v_mov_b32_e32 v16, v8 +; SDAG-NEXT: v_or_b32_e32 v8, v10, v8 +; SDAG-NEXT: v_mov_b32_e32 v17, v3 +; SDAG-NEXT: v_mov_b32_e32 v16, v2 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SDAG-NEXT: s_cbranch_execnz .LBB0_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB0_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v19 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[18:19], 1 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[18:19], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v16 ; SDAG-NEXT: v_or_b32_e32 v18, v11, v1 -; SDAG-NEXT: v_or_b32_e32 v19, v9, v3 +; SDAG-NEXT: v_or_b32_e32 v19, v3, v9 ; SDAG-NEXT: v_or_b32_e32 v22, v10, v0 -; SDAG-NEXT: v_or_b32_e32 v23, v8, v2 +; SDAG-NEXT: v_or_b32_e32 v23, v2, v8 ; SDAG-NEXT: .LBB0_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v7 ; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v20, v16 ; SDAG-NEXT: v_mov_b32_e32 v21, v17 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v7, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v1, v2 ; SDAG-NEXT: v_ffbh_u32_e32 v4, v3 -; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v0, s[4:5] ; SDAG-NEXT: v_sub_i32_e32 v5, vcc, 0, v12 -; SDAG-NEXT: v_or_b32_e32 v0, v2, v6 -; SDAG-NEXT: v_ffbh_u32_e32 v9, v6 -; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v1 -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v1, v3, v7 -; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 32, v9 -; SDAG-NEXT: v_ffbh_u32_e32 v30, v7 -; SDAG-NEXT: v_min_u32_e32 v4, v10, v4 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v14, vcc +; SDAG-NEXT: v_or_b32_e32 v0, v2, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v8 +; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 32, v1 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v13, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v3, v9 +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], 32, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v30, v9 +; SDAG-NEXT: v_min_u32_e32 v4, v7, v4 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, 0, v14, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] -; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v10, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v5, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_min_u32_e32 v1, v9, v30 +; SDAG-NEXT: v_min_u32_e32 v1, v6, v30 ; SDAG-NEXT: v_add_i32_e64 v4, s[8:9], 64, v4 ; SDAG-NEXT: v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v15, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v10, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v10, v29 -; SDAG-NEXT: v_ffbh_u32_e32 v11, v28 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, 0, v15, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v7, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v7, v29 +; SDAG-NEXT: v_ffbh_u32_e32 v10, v28 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; SDAG-NEXT: v_cndmask_b32_e64 v12, v5, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v13, v4, v1, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v6, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v4, v29, v0 -; SDAG-NEXT: v_ffbh_u32_e32 v9, v0 -; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v0 +; SDAG-NEXT: v_add_i32_e32 v7, vcc, 32, v7 ; SDAG-NEXT: v_or_b32_e32 v5, v28, v1 -; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v9 +; SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v6 ; SDAG-NEXT: v_ffbh_u32_e32 v14, v1 -; SDAG-NEXT: v_min_u32_e32 v10, v10, v11 +; SDAG-NEXT: v_min_u32_e32 v7, v7, v10 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: v_min_u32_e32 v4, v9, v14 -; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v10 -; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v4, v6, v14 +; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v7 +; SDAG-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v7, v6, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v13 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v9, v12, vcc -; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v4 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v8, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5] +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v4, v13 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v12, vcc +; SDAG-NEXT: v_xor_b32_e32 v4, 0x7f, v6 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v11, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc -; SDAG-NEXT: v_or_b32_e32 v8, v9, v10 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v11, vcc +; SDAG-NEXT: v_or_b32_e32 v4, v4, v10 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v5, v11 +; SDAG-NEXT: v_or_b32_e32 v5, v7, v11 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_and_b32_e32 v8, 1, v12 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_and_b32_e32 v4, 1, v12 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v13, v7, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v9, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v5, v8, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v2, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v4 -; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v4 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 -; SDAG-NEXT: v_mov_b32_e32 v9, 0 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v5, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6 +; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6 +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v7, vcc ; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc ; SDAG-NEXT: v_or_b32_e32 v10, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v4 +; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v6 ; SDAG-NEXT: v_or_b32_e32 v11, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[4:5], v[6:7], v34 +; SDAG-NEXT: v_lshl_b64 v[6:7], v[8:9], v34 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 ; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35 -; SDAG-NEXT: v_or_b32_e32 v5, v5, v11 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; SDAG-NEXT: v_or_b32_e32 v7, v7, v11 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v10 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[8:9], v[2:3], v30 -; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 -; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 -; SDAG-NEXT: v_lshr_b64 v[37:38], v[6:7], v30 +; SDAG-NEXT: v_lshr_b64 v[14:15], v[2:3], v30 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, 64, v30 +; SDAG-NEXT: v_subrev_i32_e32 v35, vcc, 64, v30 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[8:9], v30 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29 ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, 0 -; SDAG-NEXT: v_mov_b32_e32 v15, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_lshl_b64 v[48:49], v[6:7], v35 -; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v36 +; SDAG-NEXT: v_mov_b32_e32 v5, 0 +; SDAG-NEXT: v_lshl_b64 v[48:49], v[8:9], v4 +; SDAG-NEXT: v_lshr_b64 v[8:9], v[8:9], v35 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v9, v49 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v48 +; SDAG-NEXT: v_or_b32_e32 v4, v15, v49 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v48 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v38, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v37, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v15, 0, v38, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, 0, v37, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SDAG-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v9, v4, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: .LBB0_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v7 ; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v3 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v5 -; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v11 ; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v6, v6, v8 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v38 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v39 -; SDAG-NEXT: v_or_b32_e32 v5, v13, v5 -; SDAG-NEXT: v_or_b32_e32 v11, v15, v11 -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v2 -; SDAG-NEXT: v_or_b32_e32 v4, v12, v4 -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v3, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v6, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v7, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; SDAG-NEXT: v_and_b32_e32 v15, v8, v29 -; SDAG-NEXT: v_and_b32_e32 v38, v8, v28 -; SDAG-NEXT: v_and_b32_e32 v39, v8, v0 -; SDAG-NEXT: v_and_b32_e32 v48, v8, v1 -; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v38, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v39, vcc -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v48, vcc +; SDAG-NEXT: v_or_b32_e32 v4, v14, v4 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v38 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v39 +; SDAG-NEXT: v_or_b32_e32 v7, v13, v7 +; SDAG-NEXT: v_or_b32_e32 v11, v3, v11 +; SDAG-NEXT: v_sub_i32_e32 v3, vcc, v34, v8 +; SDAG-NEXT: v_or_b32_e32 v6, v12, v6 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v35, v9, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v36, v4, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v37, v15, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v3 +; SDAG-NEXT: v_and_b32_e32 v3, v38, v29 +; SDAG-NEXT: v_and_b32_e32 v14, v38, v28 +; SDAG-NEXT: v_and_b32_e32 v39, v38, v0 +; SDAG-NEXT: v_and_b32_e32 v48, v38, v1 +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v3 +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v14, vcc +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v4, v39, vcc +; SDAG-NEXT: v_subb_u32_e32 v15, vcc, v15, v48, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc -; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 -; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] -; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SDAG-NEXT: v_or_b32_e32 v3, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v4, v31, v33 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[3:4] +; SDAG-NEXT: v_and_b32_e32 v4, 1, v38 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v10, v14, v10 -; SDAG-NEXT: v_mov_b32_e32 v15, v9 -; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_or_b32_e32 v10, v2, v10 +; SDAG-NEXT: v_mov_b32_e32 v2, v4 +; SDAG-NEXT: v_mov_b32_e32 v3, v5 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB0_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v11 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v11 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v6 ; SDAG-NEXT: v_or_b32_e32 v13, v13, v1 -; SDAG-NEXT: v_or_b32_e32 v14, v9, v3 -; SDAG-NEXT: v_or_b32_e32 v9, v12, v0 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 +; SDAG-NEXT: v_or_b32_e32 v14, v5, v3 +; SDAG-NEXT: v_or_b32_e32 v5, v12, v0 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v2 ; SDAG-NEXT: .LBB0_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26 ; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24 ; SDAG-NEXT: v_xor_b32_e32 v7, v21, v20 ; SDAG-NEXT: v_xor_b32_e32 v6, v17, v16 -; SDAG-NEXT: v_xor_b32_e32 v4, v18, v3 -; SDAG-NEXT: v_xor_b32_e32 v5, v22, v2 +; SDAG-NEXT: v_xor_b32_e32 v8, v18, v3 +; SDAG-NEXT: v_xor_b32_e32 v9, v22, v2 ; SDAG-NEXT: v_xor_b32_e32 v1, v19, v3 ; SDAG-NEXT: v_xor_b32_e32 v0, v23, v2 ; SDAG-NEXT: v_xor_b32_e32 v10, v13, v7 -; SDAG-NEXT: v_xor_b32_e32 v9, v9, v6 -; SDAG-NEXT: v_xor_b32_e32 v11, v14, v7 +; SDAG-NEXT: v_xor_b32_e32 v11, v5, v6 +; SDAG-NEXT: v_xor_b32_e32 v5, v14, v7 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v5, v2, vcc -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc -; SDAG-NEXT: v_xor_b32_e32 v4, v8, v6 +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v9, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v3, vcc +; SDAG-NEXT: v_xor_b32_e32 v4, v4, v6 ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v11, v7, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v11, v6, vcc ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc ; SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -869,19 +869,19 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v22 -; SDAG-NEXT: v_subb_u32_e32 v24, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[22:23] +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v21 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v28, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[21:22] ; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 -; SDAG-NEXT: v_or_b32_e32 v17, v23, v25 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25] +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v16, v23 +; SDAG-NEXT: v_or_b32_e32 v17, v22, v24 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[23:24] ; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[24:25] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[23:24] ; SDAG-NEXT: v_cndmask_b32_e64 v16, v19, v18, s[4:5] ; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 @@ -895,118 +895,118 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v26, vcc, 1, v22 -; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v22 +; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v21 +; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v21 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v23, vcc +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v22, vcc ; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 -; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v24, vcc -; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v25, vcc -; SDAG-NEXT: v_or_b32_e32 v18, v26, v28 -; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v22 -; SDAG-NEXT: v_or_b32_e32 v19, v27, v29 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v30 -; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v30 -; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v30 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v31 -; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 -; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v19, v17, v19, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v16, v18, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v25, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v24, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v23, vcc +; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v24, vcc +; SDAG-NEXT: v_or_b32_e32 v22, v18, v28 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v21 +; SDAG-NEXT: v_or_b32_e32 v23, v27, v29 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v26 +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 64, v26 +; SDAG-NEXT: v_lshl_b64 v[30:31], v[0:1], v26 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23] +; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v21 +; SDAG-NEXT: v_or_b32_e32 v22, v25, v22 +; SDAG-NEXT: v_or_b32_e32 v21, v24, v21 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v22, v17, v22, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v16, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v30, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v21, v2, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_mov_b32_e32 v24, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v26 -; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 64, v26 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v22 -; SDAG-NEXT: v_or_b32_e32 v23, v21, v23 -; SDAG-NEXT: v_or_b32_e32 v22, v20, v22 -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 -; SDAG-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v26 -; SDAG-NEXT: v_lshr_b64 v[20:21], v[2:3], v20 -; SDAG-NEXT: v_cndmask_b32_e32 v21, v21, v23, vcc -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v21, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v0, v20, v0, s[4:5] -; SDAG-NEXT: v_lshr_b64 v[2:3], v[2:3], v26 +; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v18 +; SDAG-NEXT: v_sub_i32_e32 v23, vcc, 64, v18 +; SDAG-NEXT: v_lshl_b64 v[23:24], v[2:3], v23 +; SDAG-NEXT: v_or_b32_e32 v24, v20, v24 +; SDAG-NEXT: v_or_b32_e32 v23, v19, v23 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; SDAG-NEXT: v_subrev_i32_e64 v19, s[4:5], 64, v18 +; SDAG-NEXT: v_lshr_b64 v[19:20], v[2:3], v19 +; SDAG-NEXT: v_cndmask_b32_e32 v20, v20, v24, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v20, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v19, v19, v23, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v19, v0, s[4:5] +; SDAG-NEXT: v_lshr_b64 v[2:3], v[2:3], v18 ; SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v8 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc -; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: v_mov_b32_e32 v24, 0 -; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v25, 0 +; SDAG-NEXT: v_mov_b32_e32 v26, 0 ; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v17 ; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 -; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 +; SDAG-NEXT: v_or_b32_e32 v17, v26, v17 +; SDAG-NEXT: v_or_b32_e32 v16, v25, v16 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v19, 31, v1 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v20 -; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v19 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v20 -; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v30, v0 -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v31, v1, vcc -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v32, v2, vcc -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v33, v3, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v20, 31, v20 -; SDAG-NEXT: v_and_b32_e32 v24, v20, v8 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v24 -; SDAG-NEXT: v_and_b32_e32 v24, v20, v9 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v24, vcc -; SDAG-NEXT: v_and_b32_e32 v24, v20, v10 -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v24, vcc -; SDAG-NEXT: v_and_b32_e32 v24, v20, v11 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v24, vcc -; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v26 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v19 +; SDAG-NEXT: v_lshrrev_b32_e32 v19, 31, v22 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v19 +; SDAG-NEXT: v_sub_i32_e32 v19, vcc, v30, v0 +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v31, v1, vcc +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v32, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v33, v3, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v19, 31, v19 +; SDAG-NEXT: v_and_b32_e32 v25, v19, v8 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v25 +; SDAG-NEXT: v_and_b32_e32 v25, v19, v9 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v25, vcc +; SDAG-NEXT: v_and_b32_e32 v25, v19, v10 +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v25, vcc +; SDAG-NEXT: v_and_b32_e32 v25, v19, v11 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v25, vcc +; SDAG-NEXT: v_add_i32_e32 v18, vcc, -1, v18 ; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc ; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc ; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc -; SDAG-NEXT: v_or_b32_e32 v24, v26, v28 -; SDAG-NEXT: v_or_b32_e32 v25, v27, v29 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25] -; SDAG-NEXT: v_and_b32_e32 v20, 1, v20 -; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v34 -; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: v_or_b32_e32 v25, v18, v28 +; SDAG-NEXT: v_or_b32_e32 v26, v27, v29 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_and_b32_e32 v19, 1, v19 +; SDAG-NEXT: v_lshl_b64 v[21:22], v[21:22], 1 +; SDAG-NEXT: v_or_b32_e32 v21, v21, v34 +; SDAG-NEXT: v_or_b32_e32 v22, v24, v22 ; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 -; SDAG-NEXT: v_mov_b32_e32 v25, v21 -; SDAG-NEXT: v_mov_b32_e32 v24, v20 +; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 +; SDAG-NEXT: v_mov_b32_e32 v26, v20 +; SDAG-NEXT: v_mov_b32_e32 v25, v19 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SDAG-NEXT: s_cbranch_execnz .LBB1_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB1_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[18:19], 1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[21:22], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v17 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 -; SDAG-NEXT: v_or_b32_e32 v16, v23, v1 -; SDAG-NEXT: v_or_b32_e32 v18, v21, v3 -; SDAG-NEXT: v_or_b32_e32 v17, v22, v0 -; SDAG-NEXT: v_or_b32_e32 v19, v20, v2 +; SDAG-NEXT: v_or_b32_e32 v16, v24, v1 +; SDAG-NEXT: v_or_b32_e32 v18, v20, v3 +; SDAG-NEXT: v_or_b32_e32 v17, v23, v0 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v2 ; SDAG-NEXT: .LBB1_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_or_b32_e32 v1, v13, v15 @@ -1044,22 +1044,22 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc -; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v0 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v1, vcc +; SDAG-NEXT: v_xor_b32_e32 v0, 0x7f, v2 ; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v24, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v24, vcc -; SDAG-NEXT: v_or_b32_e32 v2, v2, v20 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v20 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v3, v1, v21 +; SDAG-NEXT: v_or_b32_e32 v1, v3, v21 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_and_b32_e32 v2, 1, v8 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_and_b32_e32 v0, 1, v8 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 @@ -1070,93 +1070,93 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v0 -; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v0 -; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc +; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v2 +; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v3, vcc ; SDAG-NEXT: v_lshl_b64 v[8:9], v[4:5], v8 ; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v20, vcc ; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v21, vcc ; SDAG-NEXT: v_or_b32_e32 v10, v22, v24 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v0 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v2 ; SDAG-NEXT: v_or_b32_e32 v11, v23, v25 -; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], v26 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], v26 ; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v26 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v27 -; SDAG-NEXT: v_or_b32_e32 v1, v1, v11 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v10 +; SDAG-NEXT: v_or_b32_e32 v3, v3, v11 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v10 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v21, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v20, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[2:3], v[4:5], v22 -; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v22 -; SDAG-NEXT: v_subrev_i32_e32 v28, vcc, 64, v22 +; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v22 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 64, v22 +; SDAG-NEXT: v_subrev_i32_e32 v27, vcc, 64, v22 ; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v22 ; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v12 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_mov_b32_e32 v10, 0 -; SDAG-NEXT: v_mov_b32_e32 v11, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v27 -; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v28 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v0 +; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v27 ; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v3, v3, v32 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v31 +; SDAG-NEXT: v_or_b32_e32 v0, v11, v32 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v31 ; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v30, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v29, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v7, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v30, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v29, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 -; SDAG-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v0, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, 0 ; SDAG-NEXT: .LBB1_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v0, 31, v7 ; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v2, 31, v5 -; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v1 -; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v9 ; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; SDAG-NEXT: v_or_b32_e32 v6, v6, v2 -; SDAG-NEXT: v_or_b32_e32 v2, v4, v30 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v31 -; SDAG-NEXT: v_or_b32_e32 v1, v21, v1 -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v26, v2 -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v27, v5, vcc -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v28, v6, vcc -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v29, v7, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v30, 31, v4 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v0 +; SDAG-NEXT: v_or_b32_e32 v0, v6, v30 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v31 +; SDAG-NEXT: v_or_b32_e32 v3, v21, v3 +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v26, v0 +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v27, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v28, v10, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v29, v11, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v30, 31, v6 ; SDAG-NEXT: v_and_b32_e32 v31, v30, v13 -; SDAG-NEXT: v_and_b32_e32 v4, v30, v12 -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v2, v4 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v31, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v11, v9 -; SDAG-NEXT: v_or_b32_e32 v0, v20, v0 -; SDAG-NEXT: v_and_b32_e32 v2, 1, v30 -; SDAG-NEXT: v_and_b32_e32 v11, v30, v15 +; SDAG-NEXT: v_and_b32_e32 v6, v30, v12 +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v0, v6 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v31, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v5, v9 +; SDAG-NEXT: v_or_b32_e32 v2, v20, v2 +; SDAG-NEXT: v_and_b32_e32 v0, 1, v30 +; SDAG-NEXT: v_and_b32_e32 v5, v30, v15 ; SDAG-NEXT: v_and_b32_e32 v30, v30, v14 -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v30, vcc -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v10, v30, vcc +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v5, vcc ; SDAG-NEXT: v_add_i32_e32 v22, vcc, -1, v22 ; SDAG-NEXT: v_addc_u32_e32 v23, vcc, -1, v23, vcc ; SDAG-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc @@ -1165,23 +1165,23 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v30, v22, v24 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31] ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v8, v10, v8 -; SDAG-NEXT: v_mov_b32_e32 v11, v3 -; SDAG-NEXT: v_mov_b32_e32 v10, v2 +; SDAG-NEXT: v_or_b32_e32 v8, v4, v8 +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB1_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB1_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v9 ; SDAG-NEXT: v_lshl_b64 v[4:5], v[8:9], 1 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v6 -; SDAG-NEXT: v_or_b32_e32 v8, v21, v1 -; SDAG-NEXT: v_or_b32_e32 v10, v3, v5 -; SDAG-NEXT: v_or_b32_e32 v9, v20, v0 -; SDAG-NEXT: v_or_b32_e32 v11, v2, v4 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v6 +; SDAG-NEXT: v_or_b32_e32 v8, v21, v3 +; SDAG-NEXT: v_or_b32_e32 v10, v1, v5 +; SDAG-NEXT: v_or_b32_e32 v9, v20, v2 +; SDAG-NEXT: v_or_b32_e32 v11, v0, v4 ; SDAG-NEXT: .LBB1_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mov_b32_e32 v0, v19 @@ -1674,32 +1674,32 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v32 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v32 +; SDAG-NEXT: v_lshr_b64 v[22:23], v[16:17], v32 +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 64, v32 ; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32 ; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32 ; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_mov_b32_e32 v22, 0 -; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v26 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v8 ; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37 ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v9, v27 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v26 +; SDAG-NEXT: v_or_b32_e32 v8, v23, v27 +; SDAG-NEXT: v_or_b32_e32 v22, v22, v26 ; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v2, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v49, v9, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v48, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v49, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, v48, v22, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v3, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; SDAG-NEXT: v_cndmask_b32_e32 v25, v9, v17, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v24, v8, v16, vcc -; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v25, v8, v17, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v24, v22, v16, vcc +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21 @@ -1813,109 +1813,109 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc ; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v13, v12, vcc -; SDAG-NEXT: v_xor_b32_e32 v14, 0x7f, v10 -; SDAG-NEXT: v_subb_u32_e32 v12, vcc, 0, v18, vcc +; SDAG-NEXT: v_xor_b32_e32 v12, 0x7f, v10 +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, 0, v18, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v13, vcc, 0, v18, vcc -; SDAG-NEXT: v_or_b32_e32 v14, v14, v12 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v15, v11, v13 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] -; SDAG-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; SDAG-NEXT: v_subb_u32_e32 v15, vcc, 0, v18, vcc +; SDAG-NEXT: v_or_b32_e32 v12, v12, v14 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; SDAG-NEXT: v_and_b32_e32 v14, 1, v18 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v13, v11, v15 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; SDAG-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; SDAG-NEXT: v_and_b32_e32 v12, 1, v18 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v15, v9, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v14, v8, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v9, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v8, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v10 ; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v10 -; SDAG-NEXT: v_mov_b32_e32 v14, 0 -; SDAG-NEXT: v_mov_b32_e32 v15, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: v_addc_u32_e32 v39, vcc, 0, v11, vcc ; SDAG-NEXT: v_lshl_b64 v[18:19], v[8:9], v18 -; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v12, vcc -; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v38, v48 -; SDAG-NEXT: v_sub_i32_e32 v13, vcc, 0x7f, v10 -; SDAG-NEXT: v_or_b32_e32 v12, v39, v49 -; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v13 -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v13 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[8:9], v13 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12] -; SDAG-NEXT: v_lshr_b64 v[10:11], v[8:9], v10 -; SDAG-NEXT: v_or_b32_e32 v11, v21, v11 -; SDAG-NEXT: v_or_b32_e32 v10, v20, v10 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13 -; SDAG-NEXT: v_cndmask_b32_e64 v12, v19, v11, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc +; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v15, vcc +; SDAG-NEXT: v_or_b32_e32 v14, v38, v48 +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 0x7f, v10 +; SDAG-NEXT: v_or_b32_e32 v15, v39, v49 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[4:5], v22 +; SDAG-NEXT: v_sub_i32_e32 v23, vcc, 64, v22 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[8:9], v22 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; SDAG-NEXT: v_lshr_b64 v[14:15], v[8:9], v23 +; SDAG-NEXT: v_or_b32_e32 v11, v11, v15 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v14 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22 +; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v11, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v10, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v23, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v22, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 -; SDAG-NEXT: v_cndmask_b32_e64 v13, v12, v5, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v18, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v20, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 +; SDAG-NEXT: v_cndmask_b32_e64 v15, v14, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, v18, v4, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[14:15], v[8:9], v38 -; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 64, v38 +; SDAG-NEXT: v_lshr_b64 v[20:21], v[8:9], v38 +; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 64, v38 ; SDAG-NEXT: v_subrev_i32_e32 v51, vcc, 64, v38 ; SDAG-NEXT: v_lshr_b64 v[22:23], v[4:5], v38 ; SDAG-NEXT: v_add_i32_e32 v50, vcc, -1, v37 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v24 +; SDAG-NEXT: v_mov_b32_e32 v13, 0 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v12 ; SDAG-NEXT: v_lshr_b64 v[53:54], v[4:5], v51 ; SDAG-NEXT: v_addc_u32_e32 v51, vcc, -1, v36, vcc -; SDAG-NEXT: v_or_b32_e32 v15, v15, v25 -; SDAG-NEXT: v_or_b32_e32 v14, v14, v24 +; SDAG-NEXT: v_or_b32_e32 v12, v21, v25 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v24 ; SDAG-NEXT: v_addc_u32_e32 v52, vcc, -1, v6, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v38 -; SDAG-NEXT: v_cndmask_b32_e64 v15, v54, v15, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v14, v53, v14, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v54, v12, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v53, v20, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v22, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v53, vcc, -1, v7, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 -; SDAG-NEXT: v_cndmask_b32_e32 v23, v15, v9, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v22, v14, v8, vcc -; SDAG-NEXT: v_mov_b32_e32 v15, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v23, v12, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v22, v20, v8, vcc +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: .LBB2_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v14, 31, v23 +; SDAG-NEXT: v_lshrrev_b32_e32 v12, 31, v23 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v13 -; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v15 +; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v11 ; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v24, v24, v14 +; SDAG-NEXT: v_or_b32_e32 v24, v24, v12 ; SDAG-NEXT: v_or_b32_e32 v22, v22, v54 -; SDAG-NEXT: v_or_b32_e32 v12, v12, v55 -; SDAG-NEXT: v_or_b32_e32 v13, v19, v13 +; SDAG-NEXT: v_or_b32_e32 v12, v14, v55 +; SDAG-NEXT: v_or_b32_e32 v15, v19, v15 ; SDAG-NEXT: v_or_b32_e32 v11, v21, v11 -; SDAG-NEXT: v_or_b32_e32 v12, v18, v12 -; SDAG-NEXT: v_sub_i32_e32 v14, vcc, v50, v22 -; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v51, v23, vcc -; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v52, v24, vcc -; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v53, v25, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v14 -; SDAG-NEXT: v_and_b32_e32 v14, 1, v21 +; SDAG-NEXT: v_or_b32_e32 v14, v18, v12 +; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v50, v22 +; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v51, v23, vcc +; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v52, v24, vcc +; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v53, v25, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v12 +; SDAG-NEXT: v_and_b32_e32 v12, 1, v21 ; SDAG-NEXT: v_and_b32_e32 v54, v21, v7 ; SDAG-NEXT: v_and_b32_e32 v55, v21, v6 ; SDAG-NEXT: v_and_b32_e32 v40, v21, v36 @@ -1933,83 +1933,83 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[54:55] ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; SDAG-NEXT: v_or_b32_e32 v10, v20, v10 -; SDAG-NEXT: v_mov_b32_e32 v21, v15 -; SDAG-NEXT: v_mov_b32_e32 v20, v14 +; SDAG-NEXT: v_mov_b32_e32 v21, v13 +; SDAG-NEXT: v_mov_b32_e32 v20, v12 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB2_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v11 ; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v12, v12, v20 -; SDAG-NEXT: v_or_b32_e32 v19, v19, v13 -; SDAG-NEXT: v_or_b32_e32 v15, v15, v11 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v12 -; SDAG-NEXT: v_or_b32_e32 v14, v14, v10 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v20 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v15 +; SDAG-NEXT: v_or_b32_e32 v13, v13, v11 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v14 +; SDAG-NEXT: v_or_b32_e32 v12, v12, v10 ; SDAG-NEXT: .LBB2_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v12, v33, v3 +; SDAG-NEXT: v_mul_lo_u32 v14, v33, v3 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0 ; SDAG-NEXT: v_mul_lo_u32 v24, v27, v2 ; SDAG-NEXT: v_mul_lo_u32 v25, v34, v31 ; SDAG-NEXT: v_mul_lo_u32 v34, v32, v30 ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0 -; SDAG-NEXT: v_mov_b32_e32 v13, 0 -; SDAG-NEXT: v_mul_lo_u32 v38, v14, v7 -; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v14, v6, 0 -; SDAG-NEXT: v_mul_lo_u32 v39, v15, v6 -; SDAG-NEXT: v_mul_lo_u32 v48, v19, v37 -; SDAG-NEXT: v_mul_lo_u32 v49, v18, v36 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v37, v14, 0 -; SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; SDAG-NEXT: v_mov_b32_e32 v12, v3 -; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[12:13] +; SDAG-NEXT: v_mov_b32_e32 v15, 0 +; SDAG-NEXT: v_mul_lo_u32 v38, v12, v7 +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v6, 0 +; SDAG-NEXT: v_mul_lo_u32 v39, v13, v6 +; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37 +; SDAG-NEXT: v_mul_lo_u32 v48, v18, v36 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v37, v12, 0 +; SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; SDAG-NEXT: v_mov_b32_e32 v14, v3 +; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15] ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v2 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v21, v38 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v38 ; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v24 -; SDAG-NEXT: v_mov_b32_e32 v12, v22 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[12:13] +; SDAG-NEXT: v_mov_b32_e32 v14, v22 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[14:15] ; SDAG-NEXT: v_xor_b32_e32 v24, v16, v28 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v19, v39 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v39 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11] ; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v3 ; SDAG-NEXT: v_addc_u32_e64 v23, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v31, vcc, v17, v2, vcc ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21] -; SDAG-NEXT: v_mov_b32_e32 v12, v7 -; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v36, v14, v[12:13] +; SDAG-NEXT: v_mov_b32_e32 v14, v7 +; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v36, v12, v[14:15] ; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v25, v11 -; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v30, v27, v[22:23] -; SDAG-NEXT: v_xor_b32_e32 v14, v31, v29 -; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3 -; SDAG-NEXT: v_mov_b32_e32 v12, v16 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v37, v15, v[12:13] +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v30, v27, v[22:23] +; SDAG-NEXT: v_xor_b32_e32 v18, v31, v29 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3 +; SDAG-NEXT: v_mov_b32_e32 v14, v16 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[14:15] ; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v34, v7 -; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v49, v3 -; SDAG-NEXT: v_add_i32_e64 v12, s[4:5], v17, v12 -; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 -; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v19, v7, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3 +; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v17, v15 +; SDAG-NEXT: v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v10, vcc -; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v36, v15, v[12:13] +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v36, v13, v[15:16] ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc ; SDAG-NEXT: v_xor_b32_e32 v7, v0, v28 -; SDAG-NEXT: v_add_i32_e32 v10, vcc, v12, v2 -; SDAG-NEXT: v_addc_u32_e32 v12, vcc, v13, v3, vcc +; SDAG-NEXT: v_add_i32_e32 v10, vcc, v10, v2 +; SDAG-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc ; SDAG-NEXT: v_xor_b32_e32 v3, v1, v29 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v24, v28 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v14, v29, vcc +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v18, v29, vcc ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v7, v28, vcc ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v29, vcc ; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v11, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v14, vcc ; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26 ; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc ; SDAG-NEXT: v_xor_b32_e32 v7, v7, v35 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v12, vcc +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc ; SDAG-NEXT: v_xor_b32_e32 v8, v4, v26 ; SDAG-NEXT: v_xor_b32_e32 v9, v5, v35 ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v6, v26 @@ -2557,32 +2557,32 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[16:17], v[0:1], v30 -; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v30 +; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v30 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_lshr_b64 v[26:27], v[2:3], v30 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v8 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_mov_b32_e32 v24, 0 -; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_lshl_b64 v[28:29], v[2:3], v28 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_lshl_b64 v[28:29], v[2:3], v16 ; SDAG-NEXT: v_lshr_b64 v[37:38], v[2:3], v35 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v17, v29 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v28 +; SDAG-NEXT: v_or_b32_e32 v16, v25, v29 +; SDAG-NEXT: v_or_b32_e32 v24, v24, v28 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v38, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v37, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v38, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v24, v37, v24, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SDAG-NEXT: v_cndmask_b32_e32 v27, v17, v1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v26, v16, v0, vcc -; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v27, v16, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v26, v24, v0, vcc +; SDAG-NEXT: v_mov_b32_e32 v24, 0 +; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: .LBB3_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v23 @@ -2674,108 +2674,108 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 +; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v18 ; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v17, v21 +; SDAG-NEXT: v_or_b32_e32 v17, v19, v21 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_and_b32_e32 v18, 1, v22 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v22 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v23, v7, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v5, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v4, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v16 -; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v17, vcc +; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v18 +; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v18 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc ; SDAG-NEXT: v_lshl_b64 v[22:23], v[4:5], v22 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v20, vcc ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v21, vcc -; SDAG-NEXT: v_or_b32_e32 v20, v34, v36 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 -; SDAG-NEXT: v_or_b32_e32 v21, v35, v37 -; SDAG-NEXT: v_lshl_b64 v[16:17], v[6:7], v26 -; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 -; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v26 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] -; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v27 -; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v25, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v24, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v6, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v19, v34, v36 +; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v18 +; SDAG-NEXT: v_or_b32_e32 v20, v35, v37 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[6:7], v28 +; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v28 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v28 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20] +; SDAG-NEXT: v_lshr_b64 v[18:19], v[4:5], v18 +; SDAG-NEXT: v_or_b32_e32 v19, v25, v19 +; SDAG-NEXT: v_or_b32_e32 v18, v24, v18 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v26, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[18:19], v[4:5], v34 -; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v34 +; SDAG-NEXT: v_lshr_b64 v[24:25], v[4:5], v34 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v34 ; SDAG-NEXT: v_subrev_i32_e32 v39, vcc, 64, v34 ; SDAG-NEXT: v_lshr_b64 v[26:27], v[6:7], v34 ; SDAG-NEXT: v_add_i32_e32 v38, vcc, -1, v12 ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 -; SDAG-NEXT: v_mov_b32_e32 v24, 0 -; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_lshl_b64 v[28:29], v[6:7], v28 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_lshl_b64 v[28:29], v[6:7], v16 ; SDAG-NEXT: v_lshr_b64 v[49:50], v[6:7], v39 ; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v19, v29 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v28 +; SDAG-NEXT: v_or_b32_e32 v16, v25, v29 +; SDAG-NEXT: v_or_b32_e32 v24, v24, v28 ; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v19, v50, v19, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v49, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v50, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v24, v49, v24, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 -; SDAG-NEXT: v_cndmask_b32_e32 v27, v19, v5, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v26, v18, v4, vcc -; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v27, v16, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v26, v24, v4, vcc +; SDAG-NEXT: v_mov_b32_e32 v24, 0 +; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: .LBB3_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v27 ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v17 -; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v19 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v18, v28, v18 +; SDAG-NEXT: v_or_b32_e32 v16, v28, v16 ; SDAG-NEXT: v_or_b32_e32 v26, v26, v50 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v51 -; SDAG-NEXT: v_or_b32_e32 v17, v23, v17 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v51 +; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 ; SDAG-NEXT: v_or_b32_e32 v21, v25, v21 ; SDAG-NEXT: v_sub_i32_e32 v25, vcc, v38, v26 -; SDAG-NEXT: v_or_b32_e32 v16, v22, v16 +; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 ; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v39, v27, vcc -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v48, v18, vcc +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v48, v16, vcc ; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v49, v29, vcc ; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v25 ; SDAG-NEXT: v_and_b32_e32 v28, v25, v12 @@ -2784,7 +2784,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_and_b32_e32 v52, v25, v15 ; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v28 ; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v50, vcc -; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v18, v51, vcc +; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v16, v51, vcc ; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v52, vcc ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v34 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc @@ -2793,69 +2793,69 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v50, v34, v36 ; SDAG-NEXT: v_or_b32_e32 v51, v35, v37 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[50:51] -; SDAG-NEXT: v_and_b32_e32 v18, 1, v25 +; SDAG-NEXT: v_and_b32_e32 v16, 1, v25 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; SDAG-NEXT: v_or_b32_e32 v20, v24, v20 -; SDAG-NEXT: v_mov_b32_e32 v25, v19 -; SDAG-NEXT: v_mov_b32_e32 v24, v18 +; SDAG-NEXT: v_mov_b32_e32 v25, v17 +; SDAG-NEXT: v_mov_b32_e32 v24, v16 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB3_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB3_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 -; SDAG-NEXT: v_or_b32_e32 v23, v23, v17 -; SDAG-NEXT: v_or_b32_e32 v19, v19, v21 -; SDAG-NEXT: v_or_b32_e32 v22, v22, v16 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v24 +; SDAG-NEXT: v_or_b32_e32 v23, v23, v19 +; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 +; SDAG-NEXT: v_or_b32_e32 v22, v22, v18 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 ; SDAG-NEXT: .LBB3_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mul_lo_u32 v20, v32, v11 -; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v32, v10, 0 +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v32, v10, 0 ; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10 ; SDAG-NEXT: v_mul_lo_u32 v29, v33, v8 ; SDAG-NEXT: v_mul_lo_u32 v33, v31, v9 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_mul_lo_u32 v34, v18, v15 -; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v18, v14, 0 -; SDAG-NEXT: v_mul_lo_u32 v35, v19, v14 -; SDAG-NEXT: v_mul_lo_u32 v36, v23, v12 -; SDAG-NEXT: v_mul_lo_u32 v37, v22, v13 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v18, 0 -; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; SDAG-NEXT: v_mul_lo_u32 v34, v16, v15 +; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v14, 0 +; SDAG-NEXT: v_mul_lo_u32 v35, v17, v14 +; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12 +; SDAG-NEXT: v_mul_lo_u32 v36, v22, v13 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v16, 0 +; SDAG-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; SDAG-NEXT: v_mov_b32_e32 v20, v11 ; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[20:21] ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; SDAG-NEXT: v_add_i32_e64 v23, s[4:5], v25, v34 -; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v17, v28 +; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v25, v34 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v19, v28 ; SDAG-NEXT: v_mov_b32_e32 v20, v26 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v23, v35 -; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v31, v8, v[16:17] +; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v25, v35 +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v31, v8, v[18:19] ; SDAG-NEXT: v_add_i32_e64 v26, s[4:5], v27, v11 ; SDAG-NEXT: v_addc_u32_e64 v27, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25] ; SDAG-NEXT: v_mov_b32_e32 v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v18, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v29, v17 +; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v13, v16, v[20:21] +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v29, v19 ; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[26:27] -; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v36, v11 -; SDAG-NEXT: v_mov_b32_e32 v20, v22 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v19, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v33, v15 -; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v37, v17 -; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v23, v12 -; SDAG-NEXT: v_addc_u32_e64 v18, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v16 -; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v15, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v11 +; SDAG-NEXT: v_mov_b32_e32 v20, v15 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[20:21] +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v33, v19 +; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v36, v22 +; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v16, v12 +; SDAG-NEXT: v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v18 +; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v19, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v19, v[17:18] +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[15:16] ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; SDAG-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll index 8c3d20ffb02f..d588c22a8857 100644 --- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -20,7 +20,7 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA: ; %bb.0: ; %start ; ISA-NEXT: v_readfirstlane_b32 s0, v0 ; ISA-NEXT: s_mov_b32 m0, s0 -; ISA-NEXT: s_mov_b32 s10, 0 +; ISA-NEXT: s_mov_b32 s8, 0 ; ISA-NEXT: v_interp_p1_f32_e32 v0, v1, attr0.x ; ISA-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; ISA-NEXT: s_mov_b64 s[0:1], 0 @@ -30,40 +30,42 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA-NEXT: .LBB0_1: ; %Flow1 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; ISA-NEXT: s_or_b64 exec, exec, s[4:5] -; ISA-NEXT: s_mov_b64 s[8:9], 0 ; ISA-NEXT: s_mov_b64 s[4:5], s[6:7] +; ISA-NEXT: s_mov_b64 s[6:7], 0 ; ISA-NEXT: .LBB0_2: ; %Flow ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; ISA-NEXT: s_and_b64 s[6:7], exec, s[4:5] -; ISA-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; ISA-NEXT: s_and_b64 s[10:11], exec, s[4:5] +; ISA-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] ; ISA-NEXT: s_andn2_b64 s[2:3], s[2:3], exec -; ISA-NEXT: s_and_b64 s[6:7], s[8:9], exec +; ISA-NEXT: s_and_b64 s[6:7], s[6:7], exec ; ISA-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; ISA-NEXT: s_andn2_b64 exec, exec, s[0:1] -; ISA-NEXT: s_cbranch_execz .LBB0_6 +; ISA-NEXT: s_cbranch_execz .LBB0_7 ; ISA-NEXT: .LBB0_3: ; %loop ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 ; ISA-NEXT: s_or_b64 s[4:5], s[4:5], exec +; ISA-NEXT: s_cmp_lt_u32 s8, 32 ; ISA-NEXT: s_mov_b64 s[6:7], -1 -; ISA-NEXT: s_cmp_lt_u32 s10, 32 -; ISA-NEXT: s_mov_b64 s[8:9], -1 -; ISA-NEXT: s_cbranch_scc0 .LBB0_2 +; ISA-NEXT: s_cbranch_scc0 .LBB0_6 ; ISA-NEXT: ; %bb.4: ; %endif1 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; ISA-NEXT: s_and_saveexec_b64 s[4:5], vcc ; ISA-NEXT: s_cbranch_execz .LBB0_1 ; ISA-NEXT: ; %bb.5: ; %endif2 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; ISA-NEXT: s_add_i32 s10, s10, 1 +; ISA-NEXT: s_add_i32 s8, s8, 1 ; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 ; ISA-NEXT: s_branch .LBB0_1 -; ISA-NEXT: .LBB0_6: ; %Flow2 +; ISA-NEXT: .LBB0_6: ; in Loop: Header=BB0_3 Depth=1 +; ISA-NEXT: ; implicit-def: $sgpr8 +; ISA-NEXT: s_branch .LBB0_2 +; ISA-NEXT: .LBB0_7: ; %Flow2 ; ISA-NEXT: s_or_b64 exec, exec, s[0:1] ; ISA-NEXT: v_mov_b32_e32 v1, 0 ; ISA-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] -; ISA-NEXT: ; %bb.7: ; %if1 +; ISA-NEXT: ; %bb.8: ; %if1 ; ISA-NEXT: v_sqrt_f32_e32 v1, v0 -; ISA-NEXT: ; %bb.8: ; %endloop +; ISA-NEXT: ; %bb.9: ; %endloop ; ISA-NEXT: s_or_b64 exec, exec, s[0:1] ; ISA-NEXT: exp mrt0 v1, v1, v1, v1 done vm ; ISA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index dcfac6fdbfc7..614200803d6f 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -294,8 +294,8 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_mov_b64 s[10:11], s[6:7] ; SI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index 625ac12b9983..0d8a9f6aca34 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -349,8 +349,8 @@ define amdgpu_kernel void @v_extractelement_v8i16_2(ptr addrspace(1) %out, ptr a ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -379,8 +379,8 @@ define amdgpu_kernel void @v_extractelement_v8i16_6(ptr addrspace(1) %out, ptr a ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -465,8 +465,8 @@ define amdgpu_kernel void @v_extractelement_v16i16_2(ptr addrspace(1) %out, ptr ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 5, v0 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -495,8 +495,8 @@ define amdgpu_kernel void @v_extractelement_v16i16_6(ptr addrspace(1) %out, ptr ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 5, v0 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 899cc8940544..8cf91aa90066 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -2129,14 +2129,23 @@ define double @v_fma_mul_add_32_f64(double %x, double %y) { } define <2 x double> @v_fma_mul_add_32_v2f64(<2 x double> %x, <2 x double> %y) { -; GFX9-LABEL: v_fma_mul_add_32_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40400000 -; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], v[4:5] -; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], s[4:5], v[6:7] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_fma_mul_add_32_v2f64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, 0x40400000 +; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], v[4:5] +; GFX9-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], s[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: v_fma_mul_add_32_v2f64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v9, 0x40400000 +; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[4:5] +; GFX9-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[6:7] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1011-LABEL: v_fma_mul_add_32_v2f64: ; GFX1011: ; %bb.0: @@ -2485,10 +2494,10 @@ define <2 x double> @v_mul_16_v2f64(<2 x double> %x) { ; GFX9-GISEL-LABEL: v_mul_16_v2f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40300000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] -; GFX9-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x40300000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX9-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], v[4:5] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_16_v2f64: @@ -2533,10 +2542,10 @@ define <2 x double> @v_mul_neg16_v2f64(<2 x double> %x) { ; GFX9-GISEL-LABEL: v_mul_neg16_v2f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0xc0300000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] -; GFX9-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xc0300000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX9-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], v[4:5] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_neg16_v2f64: @@ -2581,10 +2590,10 @@ define <2 x double> @v_mul_fabs_16_v2f64(<2 x double> %x) { ; GFX9-GISEL-LABEL: v_mul_fabs_16_v2f64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0x40300000 -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, s[4:5] -; GFX9-GISEL-NEXT: v_mul_f64 v[2:3], |v[2:3]|, s[4:5] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x40300000 +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, v[4:5] +; GFX9-GISEL-NEXT: v_mul_f64 v[2:3], |v[2:3]|, v[4:5] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_fabs_16_v2f64: diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index cdd34cbde6dd..e7af7467171c 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -11,8 +11,8 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0x3fe ; SDAG-NEXT: v_mov_b32_e32 v4, v0 -; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] ; SDAG-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 @@ -57,33 +57,34 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v5, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 -; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] ; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v2, v1 -; SDAG-NEXT: v_mul_lo_u32 v14, v10, v6 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[2:3] -; SDAG-NEXT: ; implicit-def: $vgpr10 -; SDAG-NEXT: v_add3_u32 v5, v5, v14, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v6 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v12, v[4:5] -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v6, v9, v11 +; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5] +; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v4, v6 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] +; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 ; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v5, v9, v5, v6 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v2, v4 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v5, s[4:5] +; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr9 @@ -376,8 +377,8 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0x3fe ; SDAG-NEXT: v_mov_b32_e32 v4, v0 -; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] ; SDAG-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 @@ -422,33 +423,34 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v5, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 -; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] ; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v2, v1 -; SDAG-NEXT: v_mul_lo_u32 v14, v10, v6 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[2:3] -; SDAG-NEXT: ; implicit-def: $vgpr10 -; SDAG-NEXT: v_add3_u32 v5, v5, v14, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v6 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v12, v[4:5] -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v6, v9, v11 +; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5] +; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v4, v6 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] +; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 ; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v5, v9, v5, v6 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v2, v4 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v5, s[4:5] +; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr9 @@ -737,17 +739,17 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG: ; %bb.0: ; %fp-to-i-entry ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e -; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc ; SDAG-NEXT: s_cbranch_execz .LBB2_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end +; SDAG-NEXT: v_mov_b32_e32 v6, 0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc @@ -765,14 +767,14 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: s_cbranch_execz .LBB2_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 +; SDAG-NEXT: v_add_co_u32_e64 v10, s[4:5], -1, v0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] -; SDAG-NEXT: v_mov_b32_e32 v7, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 +; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v7, 0x800000, v0 +; SDAG-NEXT: v_mov_b32_e32 v8, v6 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -782,56 +784,56 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5 ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] -; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[7:8] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[7:8] ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 -; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, v4 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v8, v[6:7] -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v3, s[4:5], v5, v7 -; SDAG-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v10, v9, v11 -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v8, v[3:4] -; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v5, v9, v2, v10 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v3, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v4, v5, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v6 -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 -; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 -; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, 0 +; SDAG-NEXT: v_mul_lo_u32 v4, v9, v2 +; SDAG-NEXT: v_mul_lo_u32 v14, v11, v3 +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v7 +; SDAG-NEXT: v_mul_lo_u32 v7, v10, v13 +; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v9, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v13, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v3, v10, v12 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 +; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB2_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB2_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[7:8] ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v7, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB2_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -1088,17 +1090,17 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG: ; %bb.0: ; %fp-to-i-entry ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e -; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc ; SDAG-NEXT: s_cbranch_execz .LBB3_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end +; SDAG-NEXT: v_mov_b32_e32 v6, 0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc @@ -1116,14 +1118,14 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: s_cbranch_execz .LBB3_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 +; SDAG-NEXT: v_add_co_u32_e64 v10, s[4:5], -1, v0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] -; SDAG-NEXT: v_mov_b32_e32 v7, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 +; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v7, 0x800000, v0 +; SDAG-NEXT: v_mov_b32_e32 v8, v6 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -1133,56 +1135,56 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5 ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] -; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[7:8] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[7:8] ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 -; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, v4 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v8, v[6:7] -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v3, s[4:5], v5, v7 -; SDAG-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v10, v9, v11 -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v8, v[3:4] -; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v5, v9, v2, v10 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v3, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v4, v5, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v6 -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 -; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 -; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, 0 +; SDAG-NEXT: v_mul_lo_u32 v4, v9, v2 +; SDAG-NEXT: v_mul_lo_u32 v14, v11, v3 +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v7 +; SDAG-NEXT: v_mul_lo_u32 v7, v10, v13 +; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v9, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v13, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v3, v10, v12 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 +; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB3_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB3_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[7:8] ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v7, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB3_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -1477,17 +1479,17 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG: ; %bb.0: ; %fp-to-i-entry ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_bfe_u32 v5, v4, 7, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e -; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc ; SDAG-NEXT: s_cbranch_execz .LBB6_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end +; SDAG-NEXT: v_mov_b32_e32 v6, 0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc @@ -1508,10 +1510,10 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: s_mov_b64 s[4:5], 0x85 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] -; SDAG-NEXT: v_mov_b32_e32 v7, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v6, 0x80, v0 +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v7, 0x80, v0 +; SDAG-NEXT: v_mov_b32_e32 v8, v6 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -1519,56 +1521,56 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: s_cbranch_execz .LBB6_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_add_co_u32_e64 v10, s[4:5], -1, v0 +; SDAG-NEXT: v_add_co_u32_e64 v11, s[4:5], -1, v0 ; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v5 ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff7a, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] -; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[7:8] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[7:8] ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0 -; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 -; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, v4 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v9, v[6:7] -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v3, s[4:5], v5, v7 -; SDAG-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v8, v10, v11 -; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, v[3:4] -; SDAG-NEXT: v_add3_u32 v5, v10, v2, v8 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v3, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v4, v5, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v6 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, 0 +; SDAG-NEXT: v_mul_lo_u32 v4, v10, v2 +; SDAG-NEXT: v_mul_lo_u32 v14, v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v7 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 +; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v10, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v3, v11, v12 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 -; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 -; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB6_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB6_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[7:8] ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; SDAG-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] +; SDAG-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v9 ; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8 +; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v9 ; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: .LBB6_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -1824,17 +1826,17 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG: ; %bb.0: ; %fp-to-i-entry ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_bfe_u32 v5, v4, 7, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e -; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc ; SDAG-NEXT: s_cbranch_execz .LBB7_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end +; SDAG-NEXT: v_mov_b32_e32 v6, 0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc @@ -1855,10 +1857,10 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: s_mov_b64 s[4:5], 0x85 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] -; SDAG-NEXT: v_mov_b32_e32 v7, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v6, 0x80, v0 +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v7, 0x80, v0 +; SDAG-NEXT: v_mov_b32_e32 v8, v6 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -1866,56 +1868,56 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: s_cbranch_execz .LBB7_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_add_co_u32_e64 v10, s[4:5], -1, v0 +; SDAG-NEXT: v_add_co_u32_e64 v11, s[4:5], -1, v0 ; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v5 ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff7a, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] -; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[7:8] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[7:8] ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0 -; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 -; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, v4 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v9, v[6:7] -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v3, s[4:5], v5, v7 -; SDAG-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v8, v10, v11 -; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, v[3:4] -; SDAG-NEXT: v_add3_u32 v5, v10, v2, v8 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v3, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v4, v5, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v6 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, 0 +; SDAG-NEXT: v_mul_lo_u32 v4, v10, v2 +; SDAG-NEXT: v_mul_lo_u32 v14, v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v7 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 +; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v10, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v3, v11, v12 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 -; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 -; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB7_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB7_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5 -; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[7:8] ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; SDAG-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] +; SDAG-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v9 ; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8 +; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v9 ; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: .LBB7_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index cffa287dd91f..18567ef647d8 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -2005,11 +2005,9 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) { ; GFX6-GISEL-LABEL: v_sqrt_v2f64_afn: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX6-GISEL-NEXT: s_brev_b32 s5, 8 -; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, s5 -; GFX6-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] ; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] ; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] @@ -2054,11 +2052,9 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) { ; GFX8-GISEL-LABEL: v_sqrt_v2f64_afn: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX8-GISEL-NEXT: s_brev_b32 s5, 8 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] ; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] @@ -2548,11 +2544,9 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) { ; GFX6-GISEL-LABEL: v_sqrt_v2f64_afn_nnan_ninf: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX6-GISEL-NEXT: s_brev_b32 s5, 8 -; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, s5 -; GFX6-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] ; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] ; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] @@ -2597,11 +2591,9 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) { ; GFX8-GISEL-LABEL: v_sqrt_v2f64_afn_nnan_ninf: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX8-GISEL-NEXT: s_brev_b32 s5, 8 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] ; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] @@ -3206,11 +3198,9 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) { ; GFX6-GISEL-LABEL: v_sqrt_v2f64: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX6-GISEL-NEXT: s_brev_b32 s5, 8 -; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, s5 -; GFX6-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] ; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] ; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] @@ -3255,11 +3245,9 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) { ; GFX8-GISEL-LABEL: v_sqrt_v2f64: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX8-GISEL-NEXT: s_brev_b32 s5, 8 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] ; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] @@ -3436,23 +3424,21 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; GFX6-GISEL-LABEL: v_sqrt_v3f64: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX6-GISEL-NEXT: s_brev_b32 s5, 8 -; GFX6-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 -; GFX6-GISEL-NEXT: v_mov_b32_e32 v6, s4 -; GFX6-GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GFX6-GISEL-NEXT: v_mov_b32_e32 v6, 0 +; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v7, 8 +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] ; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] ; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7] +; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 ; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 -; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] ; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] ; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX6-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] ; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] ; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[12:13], v[4:5] ; GFX6-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], 0.5 @@ -3504,23 +3490,21 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; GFX8-GISEL-LABEL: v_sqrt_v3f64: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX8-GISEL-NEXT: s_brev_b32 s5, 8 -; GFX8-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, 0 +; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v7, 8 +; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] ; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] ; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 -; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] ; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX8-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] ; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] ; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[12:13], v[4:5] ; GFX8-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], 0.5 diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 56ceba258f47..2c03113e8af4 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -41,15 +41,14 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v5, s69, 19 ; CHECK-NEXT: v_writelane_b32 v5, s70, 20 ; CHECK-NEXT: s_mov_b32 s68, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_writelane_b32 v5, s71, 21 -; CHECK-NEXT: v_mov_b32_e32 v2, s4 -; CHECK-NEXT: v_mov_b32_e32 v3, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_mov_b32 s69, s68 ; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 -; CHECK-NEXT: image_sample_lz v3, v[2:3], s[16:23], s[68:71] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b32 s6, 48 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -86,7 +85,7 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 ; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll index f582f984a392..9f5bbf834fdf 100644 --- a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll +++ b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll @@ -9,9 +9,9 @@ define amdgpu_kernel void @func(ptr addrspace(1) %in, ptr addrspace(3) %out) { ; CHECK: ; %bb.0: ; %.lr.ph ; CHECK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 -; CHECK-NEXT: s_mov_b64 s[10:11], 0 ; CHECK-NEXT: s_mov_b32 s3, 32 ; CHECK-NEXT: s_mov_b32 s2, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], 0 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 ; CHECK-NEXT: .LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll index fcdad5355382..50bf63253337 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll @@ -17,7 +17,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(<1 x i64> %L1) { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b128 v0, v[2:5] ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, v2 +; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: ds_write_b128 v0, v[30:33] offset:112 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 78be949baaba..c1508c1675fe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -5058,10 +5058,10 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] @@ -5075,10 +5075,10 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 2.0 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3] @@ -5222,7 +5222,6 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, 2.0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 @@ -5238,6 +5237,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] @@ -5254,7 +5254,6 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX942-NEXT: v_accvgpr_write_b32 a15, 2.0 ; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 @@ -5270,6 +5269,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 +; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v2, a[0:15] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index 3a4bf1c81ed5..eed67d9e020d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -1939,40 +1939,40 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) { ; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX6-GISEL-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX6-GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GFX6-GISEL-NEXT: v_mov_b32_e32 v8, 0x7ff00000 ; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[5:6], v[0:1] ; GFX6-GISEL-NEXT: v_frexp_exp_i32_f64_e32 v4, v[0:1] -; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[7:8] +; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[9:10], v[2:3] ; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[6:7], v[2:3] ; GFX6-GISEL-NEXT: v_frexp_exp_i32_f64_e32 v5, v[2:3] -; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, v[7:8] ; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) ret { <2 x double>, <2 x i32> } %result } define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { -; GFX6-LABEL: test_frexp_v2f64_v2i32_only_use_fract: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0 -; GFX6-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: v_frexp_mant_f64_e32 v[4:5], v[2:3] -; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: s_setpc_b64 s[30:31] +; GFX6-SDAG-LABEL: test_frexp_v2f64_v2i32_only_use_fract: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] +; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[4:5], v[2:3] +; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_v2f64_v2i32_only_use_fract: ; GFX8: ; %bb.0: @@ -2005,24 +2005,39 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { ; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] ; GFX12-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32_only_use_fract: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[6:7], v[0:1] +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[4:5] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[6:7], v[2:3] +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, v[4:5] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) %result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0 ret <2 x double> %result.0 } define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) { -; GFX6-LABEL: test_frexp_v2f64_v2i32_only_use_exp: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0 -; GFX6-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-NEXT: v_frexp_exp_i32_f64_e32 v4, v[0:1] -; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX6-NEXT: v_frexp_exp_i32_f64_e32 v1, v[2:3] -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX6-NEXT: s_setpc_b64 s[30:31] +; GFX6-SDAG-LABEL: test_frexp_v2f64_v2i32_only_use_exp: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX6-SDAG-NEXT: v_frexp_exp_i32_f64_e32 v4, v[0:1] +; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-SDAG-NEXT: v_frexp_exp_i32_f64_e32 v1, v[2:3] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_v2f64_v2i32_only_use_exp: ; GFX8: ; %bb.0: @@ -2055,6 +2070,19 @@ define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) { ; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] ; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32_only_use_exp: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX6-GISEL-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[4:5] +; GFX6-GISEL-NEXT: v_frexp_exp_i32_f64_e32 v1, v[2:3] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, v[4:5] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) %result.1 = extractvalue { <2 x double>, <2 x i32> } %result, 1 ret <2 x i32> %result.1 @@ -2079,3 +2107,4 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo ; GFX11-SDAG: {{.*}} ; GFX12-GISEL: {{.*}} ; GFX12-SDAG: {{.*}} +; GFX6: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll index 41eeeaf51df9..320d3c77a6d9 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll @@ -1049,9 +1049,9 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) ; SI-STD: ; %bb.0: ; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-STD-NEXT: s_load_dword s6, s[4:5], 0xd -; SI-STD-NEXT: s_mov_b32 s2, 0 ; SI-STD-NEXT: v_mov_b32_e32 v1, 0 ; SI-STD-NEXT: s_mov_b32 s3, 0xf000 +; SI-STD-NEXT: s_mov_b32 s2, 0 ; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-STD-NEXT: s_waitcnt lgkmcnt(0) ; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc @@ -1097,9 +1097,9 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) ; SI-DENORM-FASTFMAF: ; %bb.0: ; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-DENORM-FASTFMAF-NEXT: s_load_dword s6, s[4:5], 0xd -; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0 ; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0) ; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc @@ -1145,9 +1145,9 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) ; SI-DENORM-SLOWFMAF: ; %bb.0: ; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s6, s[4:5], 0xd -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0 ; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0) ; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc @@ -1277,9 +1277,9 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) ; SI-STD: ; %bb.0: ; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-STD-NEXT: s_load_dword s6, s[4:5], 0xd -; SI-STD-NEXT: s_mov_b32 s2, 0 ; SI-STD-NEXT: v_mov_b32_e32 v1, 0 ; SI-STD-NEXT: s_mov_b32 s3, 0xf000 +; SI-STD-NEXT: s_mov_b32 s2, 0 ; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-STD-NEXT: s_waitcnt lgkmcnt(0) ; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc @@ -1325,9 +1325,9 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) ; SI-DENORM-FASTFMAF: ; %bb.0: ; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-DENORM-FASTFMAF-NEXT: s_load_dword s6, s[4:5], 0xd -; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0 ; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0) ; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc @@ -1373,9 +1373,9 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) ; SI-DENORM-SLOWFMAF: ; %bb.0: ; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s6, s[4:5], 0xd -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0 ; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0) ; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc @@ -1457,9 +1457,9 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) ; SI-STD: ; %bb.0: ; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-STD-NEXT: s_load_dword s6, s[4:5], 0xd -; SI-STD-NEXT: s_mov_b32 s2, 0 ; SI-STD-NEXT: v_mov_b32_e32 v1, 0 ; SI-STD-NEXT: s_mov_b32 s3, 0xf000 +; SI-STD-NEXT: s_mov_b32 s2, 0 ; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-STD-NEXT: s_waitcnt lgkmcnt(0) ; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc @@ -1505,9 +1505,9 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) ; SI-DENORM-FASTFMAF: ; %bb.0: ; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-DENORM-FASTFMAF-NEXT: s_load_dword s6, s[4:5], 0xd -; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0 ; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0) ; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc @@ -1553,9 +1553,9 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) ; SI-DENORM-SLOWFMAF: ; %bb.0: ; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s6, s[4:5], 0xd -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0 ; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0) ; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc diff --git a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll index 3b855a56a5ab..deb97a9812b4 100644 --- a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll +++ b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll @@ -217,7 +217,6 @@ define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg noc ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v15, 0 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, 0 @@ -233,6 +232,7 @@ define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg noc ; GFX942-NEXT: v_mov_b32_e32 v19, 0 ; GFX942-NEXT: v_mov_b32_e32 v13, 0 ; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v15, 0 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB8_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 1870d1bcb180..e29da3a6b000 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -310,8 +310,8 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: s_mov_b64 s[6:7], s[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm-multi-use.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm-multi-use.mir new file mode 100644 index 000000000000..cf515e7d9286 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm-multi-use.mir @@ -0,0 +1,94 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass=peephole-opt -o - %s | FileCheck %s + +# Breaking mov of 64-bit inline immediate will increase instruction +# count. +--- +name: no_break_s_mov_b64_multi_use_copy_inline_imm_extract +body: | + bb.0: + + ; GCN-LABEL: name: no_break_s_mov_b64_multi_use_inline_imm_extract + ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_MOV_B64_]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed [[S_MOV_B64_]].sub1 + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]], [[COPY1]] + ; CHECK-LABEL: name: no_break_s_mov_b64_multi_use_copy_inline_imm_extract + ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_MOV_B64_]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed [[S_MOV_B64_]].sub1 + ; CHECK-NEXT: SI_RETURN_TO_EPILOG [[COPY]], [[COPY1]] + %0:sreg_64 = S_MOV_B64 0 + %1:sreg_32 = COPY killed %0.sub0 + %2:sreg_32 = COPY killed %0.sub1 + SI_RETURN_TO_EPILOG %1, %2 + +... + +--- +name: no_break_v_mov_b64_multi_use_copy_inline_imm_extract +body: | + bb.0: + + ; GCN-LABEL: name: no_break_s_mov_b64_multi_use_inline_imm_extract + ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_MOV_B64_]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed [[S_MOV_B64_]].sub1 + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]], [[COPY1]] + ; CHECK-LABEL: name: no_break_v_mov_b64_multi_use_copy_inline_imm_extract + ; CHECK: [[V_MOV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 0, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed [[V_MOV_B64_e64_]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[V_MOV_B64_e64_]].sub1 + ; CHECK-NEXT: SI_RETURN_TO_EPILOG [[COPY]], [[COPY1]] + %0:vreg_64_align2 = V_MOV_B64_e64 0, implicit $exec + %1:vgpr_32 = COPY killed %0.sub0 + %2:vgpr_32 = COPY killed %0.sub1 + SI_RETURN_TO_EPILOG %1, %2 + +... + +# The high half extract is an inline immediate in the use context, so +# this should fold despite multiple uses. +--- +name: break_s_mov_b64_multi_use_copy_extract_use_is_inline_imm +body: | + bb.0: + + ; GCN-LABEL: name: break_s_mov_b64_multi_use_extract_use_is_inline_imm + ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -96 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_MOV_B]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed [[S_MOV_B]].sub1 + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]], [[COPY1]] + ; CHECK-LABEL: name: break_s_mov_b64_multi_use_copy_extract_use_is_inline_imm + ; CHECK: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -96 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_MOV_B]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed [[S_MOV_B]].sub1 + ; CHECK-NEXT: SI_RETURN_TO_EPILOG [[COPY]], [[COPY1]] + %0:sreg_64 = S_MOV_B64_IMM_PSEUDO -96 + %1:sreg_32 = COPY killed %0.sub0 + %2:sreg_32 = COPY killed %0.sub1 + SI_RETURN_TO_EPILOG %1, %2 + +... + +--- +name: break_v_mov_b64_multi_use_copy_extract_use_is_inline_imm +body: | + bb.0: + + ; GCN-LABEL: name: break_s_mov_b64_multi_use_extract_use_is_inline_imm + ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -96 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_MOV_B]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed [[S_MOV_B]].sub1 + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]], [[COPY1]] + ; CHECK-LABEL: name: break_v_mov_b64_multi_use_copy_extract_use_is_inline_imm + ; CHECK: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -96, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed [[V_MOV_B]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[V_MOV_B]].sub1 + ; CHECK-NEXT: SI_RETURN_TO_EPILOG [[COPY]], [[COPY1]] + %0:vreg_64_align2 = V_MOV_B64_PSEUDO -96, implicit $exec + %1:vgpr_32 = COPY killed %0.sub0 + %2:vgpr_32 = COPY killed %0.sub1 + SI_RETURN_TO_EPILOG %1, %2 + +... diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index f0c8fed92567..5d0db8fd55d9 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -146,8 +146,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v18, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, -1, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_mov_b32_e32 v19, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v19, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -423,31 +423,31 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[10:11], s[8:9] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: s_mov_b32 s13, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s13 +; GFX9-O0-NEXT: s_mov_b32 s12, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 ; GFX9-O0-NEXT: v_min_u32_e64 v7, v7, v8 -; GFX9-O0-NEXT: s_mov_b32 s12, 0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr14 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s12 +; GFX9-O0-NEXT: s_mov_b32 s13, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr13 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s13 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 ; GFX9-O0-NEXT: v_min_u32_e64 v12, v6, v9 -; GFX9-O0-NEXT: ; implicit-def: $sgpr14 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr13 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 ; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 ; GFX9-O0-NEXT: s_mov_b32 s16, s14 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: s_mov_b32 s18, s15 +; GFX9-O0-NEXT: s_mov_b32 s13, s15 ; GFX9-O0-NEXT: v_add_co_u32_e64 v9, s[16:17], v9, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s13 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[16:17], v6, v10, s[16:17] ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 @@ -463,20 +463,20 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr16 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 ; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr13 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 @@ -581,17 +581,17 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 4 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-O0-NEXT: s_branch .LBB0_8 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 @@ -623,9 +623,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 @@ -683,9 +683,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_3 ; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 @@ -714,9 +714,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 @@ -908,9 +908,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 11 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill @@ -939,9 +939,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 ; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload @@ -1041,9 +1041,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill @@ -1070,9 +1070,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -1199,9 +1199,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 @@ -1247,8 +1247,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec ; GFX9-O0-NEXT: s_mov_b32 s5, 0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v17 @@ -1269,17 +1269,17 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v8, v0, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 ; GFX9-O0-NEXT: v_add3_u32 v8, v8, v9, v14 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 ; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v9 ; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v19, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 @@ -1299,16 +1299,16 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 ; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v5, v1, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v14 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v19, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v9 ; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] @@ -1321,16 +1321,16 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v8 ; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v5, v2, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, s6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 ; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] @@ -1343,8 +1343,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v5 ; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v0, v2, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v22 @@ -1357,25 +1357,25 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff -; GFX9-O0-NEXT: s_mov_b32 s8, s7 -; GFX9-O0-NEXT: v_and_b32_e64 v2, v2, s8 +; GFX9-O0-NEXT: s_mov_b32 s5, s7 +; GFX9-O0-NEXT: v_and_b32_e64 v2, v2, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: v_and_b32_e64 v18, v5, s6 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: v_and_b32_e64 v18, v5, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 ; GFX9-O0-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v0, v1, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v22 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v5 ; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s4, v[22:23] @@ -1423,8 +1423,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 @@ -1621,8 +1621,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v20, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v7, vcc ; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mov_b32_e32 v21, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v21, 0 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1799,31 +1799,31 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: s_mov_b32 s9, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: s_mov_b32 s8, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 -; GFX9-O0-NEXT: s_mov_b32 s8, 0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 ; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_mov_b32 s12, s10 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 -; GFX9-O0-NEXT: s_mov_b32 s14, s11 +; GFX9-O0-NEXT: s_mov_b32 s9, s11 ; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[12:13], v7, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s9 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v8, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 @@ -1834,25 +1834,25 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v5, v6, s[12:13] -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 ; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 @@ -2619,8 +2619,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: s_mov_b32 s5, 0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 @@ -2641,17 +2641,17 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v2, v4, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 ; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 @@ -2671,16 +2671,16 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 ; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v7, v5, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 ; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] @@ -2693,16 +2693,16 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v7, v6, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, s6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 ; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] @@ -2715,8 +2715,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v19, v7 ; GFX9-O0-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v4, v6, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v18 @@ -2729,25 +2729,25 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff -; GFX9-O0-NEXT: s_mov_b32 s8, s7 -; GFX9-O0-NEXT: v_and_b32_e64 v14, v14, s8 +; GFX9-O0-NEXT: s_mov_b32 s5, s7 +; GFX9-O0-NEXT: v_and_b32_e64 v14, v14, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: v_and_b32_e64 v16, v15, s6 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: v_and_b32_e64 v16, v15, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 ; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v4, v5, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v14 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v19, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v19 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 ; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] @@ -2795,8 +2795,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], s4, v[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll index 59a1fe041bf9..cf3edc0b4ac9 100644 --- a/llvm/test/CodeGen/AMDGPU/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll @@ -1258,17 +1258,17 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) { ; GFX6-NEXT: v_mov_b32_e32 v4, 0 ; GFX6-NEXT: v_or_b32_e32 v5, 0x43300000, v5 ; GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] -; GFX6-NEXT: s_mov_b32 s4, -1 -; GFX6-NEXT: s_mov_b32 s5, 0x432fffff +; GFX6-NEXT: v_mov_b32_e32 v8, -1 +; GFX6-NEXT: v_mov_b32_e32 v9, 0x432fffff ; GFX6-NEXT: v_add_f64 v[5:6], v[6:7], -v[4:5] -; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, v[8:9] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GFX6-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX6-NEXT: v_or_b32_e32 v5, 0x43300000, v5 -; GFX6-NEXT: v_add_f64 v[7:8], v[2:3], v[4:5] +; GFX6-NEXT: v_add_f64 v[10:11], v[2:3], v[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX6-NEXT: v_add_f64 v[4:5], v[7:8], -v[4:5] -; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[4:5] +; GFX6-NEXT: v_add_f64 v[4:5], v[10:11], -v[4:5] +; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[2:3]|, v[8:9] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll index 4aac193d6aea..e34fdd9ae690 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -1660,33 +1660,31 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-LABEL: v_rsq_v2f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 ; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 -; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] ; SI-GISEL-NEXT: v_mov_b32_e32 v18, 0x3ff00000 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v10, 8, v12 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v10 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 ; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 ; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 @@ -1807,11 +1805,9 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-LABEL: v_rsq_v2f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] ; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 @@ -1842,15 +1838,15 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] ; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc -; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 ; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 ; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], 1.0 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], 1.0 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 ; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 @@ -1960,33 +1956,31 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-LABEL: v_neg_rsq_v2f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 ; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 -; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] ; SI-GISEL-NEXT: v_mov_b32_e32 v18, 0xbff00000 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v10, 8, v12 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v10 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 ; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 ; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 @@ -2107,11 +2101,9 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-LABEL: v_neg_rsq_v2f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] ; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 @@ -2142,15 +2134,15 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] ; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc -; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 ; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 ; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0 ; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0 @@ -2229,38 +2221,36 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v12 +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 -; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] -; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc @@ -2349,11 +2339,9 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; VI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] ; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 @@ -2384,15 +2372,15 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] ; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc -; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 ; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 ; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5] @@ -2503,38 +2491,36 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-LABEL: v_neg_pos_rsq_v2f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v12 +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 -; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] -; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc @@ -2652,11 +2638,9 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-LABEL: v_neg_pos_rsq_v2f64: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] ; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 @@ -2687,15 +2671,15 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7] ; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc -; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] +; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 ; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 ; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] ; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 ; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 @@ -4317,48 +4301,46 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { ; SI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b32 s4, 0 -; SI-GISEL-NEXT: s_brev_b32 s5, 8 -; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 -; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] +; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v12 +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 -; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] -; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9] ; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5] ; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[0:1] @@ -4440,11 +4422,9 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { ; VI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf: ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_brev_b32 s5, 8 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 -; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8 +; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] ; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5] ; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] @@ -5612,15 +5592,15 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80 +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-GISEL-NEXT: v_mov_b32_e32 v11, 0x260 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SI-GISEL-NEXT: s_mov_b32 s6, 0 -; SI-GISEL-NEXT: s_mov_b32 s7, 0x40700000 -; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x40700000 +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0 +; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x40700000 +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x40700000 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 @@ -5630,25 +5610,25 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc ; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v11 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], s[6:7], v[0:1], s[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], v[8:9] +; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[8:9], v[0:1], v[8:9] ; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v12 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[10:11] ; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], v[8:9] ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_div_const_contract_sqrt_f64: @@ -5698,8 +5678,6 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8 ; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] -; VI-GISEL-NEXT: s_mov_b32 s4, 0 -; VI-GISEL-NEXT: s_mov_b32 s5, 0x40700000 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 @@ -5718,19 +5696,21 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 ; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x40700000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5] -; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, s[4:5], v[0:1], s[4:5] -; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 -; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] -; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] -; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], v[4:5] +; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[0:1], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], v[8:9] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[6:7], v[10:11] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], v[4:5] ; VI-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract double @llvm.sqrt.f64(double %x) %rsq = fdiv contract double 256.0, %sqrt diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 4addf42b2798..5e76c7d7c734 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1747,8 +1747,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index 6be41fb8889b..59a884c82931 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -125,8 +125,8 @@ define amdgpu_kernel void @v_uextract_bit_32_i64(ptr addrspace(1) %out, ptr addr ; GCN-LABEL: v_uextract_bit_32_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -470,8 +470,8 @@ define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(ptr addrspace(1) %out ; GCN-LABEL: v_uextract_bit_33_i64_trunc_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -627,8 +627,8 @@ define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(ptr add ; GCN-LABEL: v_uextract_bit_33_36_use_upper_half_shift_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index 101787abf8ea..76bf9176143f 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -401,11 +401,11 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: v_select_sint_to_fp_i1_vals_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, 0xbff00000 +; GCN-NEXT: v_mov_b32_e32 v3, 0xbff00000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -414,10 +414,10 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0xbff00000 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc -; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX942-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 @@ -482,11 +482,11 @@ define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: v_select_sint_to_fp_i1_vals_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, 0xbff00000 +; GCN-NEXT: v_mov_b32_e32 v3, 0xbff00000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -495,10 +495,10 @@ define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0xbff00000 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc -; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX942-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 @@ -512,11 +512,11 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in ; GCN-LABEL: v_swap_select_sint_to_fp_i1_vals_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, 0xbff00000 +; GCN-NEXT: v_mov_b32_e32 v3, 0xbff00000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -525,10 +525,10 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0xbff00000 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e64 v5, v3, 0, vcc -; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX942-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index eb0d5465cacd..2040e2b26cb1 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -191,72 +191,72 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 { ; GFX908-LABEL: max_32regs_mfma32: ; GFX908: ; %bb.0: ; %bb -; GFX908-NEXT: v_mov_b32_e32 v2, 0x40400000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x40c00000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x40e00000 -; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x40a00000 -; GFX908-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a6, v4 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41000000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41100000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41200000 -; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a9, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41300000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41400000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41500000 -; GFX908-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a11, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41600000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41700000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41800000 -; GFX908-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a14, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a15, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41880000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41900000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41980000 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a17, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a18, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41a00000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41a80000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41b00000 -; GFX908-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a21, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41b80000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41c00000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41c80000 -; GFX908-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a23, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41d00000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41d80000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41e00000 -; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX908-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a26, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a27, v4 -; GFX908-NEXT: v_mov_b32_e32 v2, 0x41e80000 -; GFX908-NEXT: v_mov_b32_e32 v3, 0x41f00000 -; GFX908-NEXT: v_mov_b32_e32 v4, 0x41f80000 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x40400000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x40c00000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x40e00000 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x40a00000 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41000000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41100000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41200000 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41300000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41400000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41500000 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41600000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41700000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41800000 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41880000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41900000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41980000 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41a00000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41a80000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41b00000 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41b80000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41c00000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41c80000 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41d00000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41d80000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41e00000 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x41e80000 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41f00000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41f80000 +; GFX908-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, 2.0 ; GFX908-NEXT: v_accvgpr_write_b32 a3, 4.0 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a29, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a30, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a31, 2.0 ; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31] +; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v4, a[0:31] ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 5 @@ -272,73 +272,73 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 { ; ; GFX90A-LABEL: max_32regs_mfma32: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40400000 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40a00000 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40c00000 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40e00000 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41000000 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41100000 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41200000 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41300000 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41400000 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41500000 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41600000 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41700000 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41800000 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41880000 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41900000 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41980000 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41a00000 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41a80000 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41b00000 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41b80000 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41c00000 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41c80000 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41d00000 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41d80000 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41e00000 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41e80000 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f00000 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40400000 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40a00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40e00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41000000 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41100000 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41300000 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41400000 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41500000 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41600000 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41700000 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41800000 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41880000 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41900000 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41980000 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41a00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41a80000 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41b00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41b80000 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41c00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41c80000 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41d00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41d80000 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41e00000 +; GFX90A-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41e80000 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41f00000 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f80000 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41f80000 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 4.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a1 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v2, a[0:31] ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 2 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index e64e3def98c2..c7b690fbd4a2 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1868,8 +1868,8 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index c05f341f9e91..571c0f04c06c 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -412,8 +412,8 @@ define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) ; SI-LABEL: v_lshr_32_i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll index a69ee2e1a8b5..199ab49fa19d 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -63,7 +63,7 @@ define amdgpu_ps void @foo() #0 { ; GCN-NEXT: s_branch .LBB1_5 ; GCN-NEXT: .LBB1_4: ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: .LBB1_5: ; %bb14 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 0x41280000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index bc9a3f2389e7..bf1f6980fe25 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1186,8 +1186,8 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index 983acfc2c069..b31cc36a5f7c 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -660,11 +660,11 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: v_select_uint_to_fp_i1_vals_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, 0x3ff00000 +; GCN-NEXT: v_mov_b32_e32 v3, 0x3ff00000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -673,10 +673,10 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0x3ff00000 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc -; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX942-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 @@ -741,11 +741,11 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: v_select_uint_to_fp_i1_vals_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, 0x3ff00000 +; GCN-NEXT: v_mov_b32_e32 v3, 0x3ff00000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -754,10 +754,10 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0x3ff00000 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc -; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX942-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 @@ -823,11 +823,11 @@ define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in ; GCN-LABEL: v_swap_select_uint_to_fp_i1_vals_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, 0x3ff00000 +; GCN-NEXT: v_mov_b32_e32 v3, 0x3ff00000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -836,10 +836,10 @@ define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0x3ff00000 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e64 v5, v3, 0, vcc -; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX942-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll index b3166fa3f454..3e40f9a5b2b1 100644 --- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll +++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; CHECK-NEXT: v_pk_mov_b32 v[44:45], 0, 0 -; CHECK-NEXT: flat_load_dword v42, v[44:45] +; CHECK-NEXT: v_pk_mov_b32 v[46:47], 0, 0 +; CHECK-NEXT: flat_load_dword v42, v[46:47] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8 @@ -26,17 +26,17 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_cselect_b32 s8, s64, 0 ; CHECK-NEXT: s_add_u32 s50, s34, 48 ; CHECK-NEXT: s_addc_u32 s51, s35, 0 -; CHECK-NEXT: v_pk_mov_b32 v[56:57], s[4:5], s[4:5] op_sel:[0,1] +; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[4:5], s[4:5] op_sel:[0,1] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: v_mov_b32_e32 v47, s7 +; CHECK-NEXT: v_mov_b32_e32 v57, s7 ; CHECK-NEXT: s_mov_b32 s7, s6 ; CHECK-NEXT: s_mov_b32 s53, s14 -; CHECK-NEXT: v_mov_b32_e32 v46, s8 -; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[6:7], s[6:7] op_sel:[0,1] +; CHECK-NEXT: v_mov_b32_e32 v56, s8 +; CHECK-NEXT: v_pk_mov_b32 v[60:61], s[6:7], s[6:7] op_sel:[0,1] ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51] @@ -49,13 +49,13 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_mov_b32 s52, s15 ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: v_mov_b32_e32 v40, v0 -; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59] +; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61] ; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] -; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[56:57] -; CHECK-NEXT: v_mov_b32_e32 v62, 0 -; CHECK-NEXT: v_mov_b32_e32 v63, 0x3ff00000 +; CHECK-NEXT: flat_load_dwordx2 v[62:63], v[58:59] +; CHECK-NEXT: v_mov_b32_e32 v44, 0 +; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000 ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51] @@ -64,30 +64,28 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_mov_b32 s13, s52 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[62:63] -; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59] +; CHECK-NEXT: flat_store_dwordx2 v[46:47], v[44:45] +; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] -; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[46:47] glc +; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[56:57] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s64 ; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42 -; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[60:61] +; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[56:57], a[32:33] +; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33] ; CHECK-NEXT: buffer_store_dword a33, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v62, v0, s[0:3], 0 offen -; CHECK-NEXT: ; implicit-def: $vgpr4 +; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_4 ; CHECK-NEXT: ; %bb.1: ; %LeafBlock5 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v42 -; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: ; %bb.2: ; %sw.bb17.i.i.i.i -; CHECK-NEXT: v_mov_b32_e32 v4, 1 +; CHECK-NEXT: v_mov_b32_e32 v44, 1 ; CHECK-NEXT: ; %bb.3: ; %Flow ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: .LBB0_4: ; %Flow8 @@ -105,10 +103,10 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: ; %bb.7: ; %Flow7 ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v44, 0 ; CHECK-NEXT: .LBB0_8: ; %bb.1 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.9: ; %sw.bb.i.i.i.i.i diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 464dad83f47c..c4d928185d8f 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -664,96 +664,94 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 ; GCN-LABEL: s_test_urem23_64_v2i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s6, s13, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GCN-NEXT: s_lshr_b32 s0, s15, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s0 -; GCN-NEXT: s_lshr_b32 s7, s11, 9 +; GCN-NEXT: s_lshr_b32 s4, s15, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GCN-NEXT: s_lshr_b32 s5, s11, 9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_sub_i32 s1, 0, s6 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GCN-NEXT: s_sub_i32 s8, 0, s6 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 -; GCN-NEXT: s_lshr_b32 s8, s9, 1 -; GCN-NEXT: v_mul_hi_u32 v5, v0, v5 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v4, s8, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 +; GCN-NEXT: s_lshr_b32 s7, s9, 1 +; GCN-NEXT: v_mul_hi_u32 v4, v0, v4 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v1, v1, s4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v1 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mul_i32 s4, s4, s6 -; GCN-NEXT: s_sub_i32 s4, s8, s4 +; GCN-NEXT: s_sub_i32 s4, s7, s4 ; GCN-NEXT: s_sub_i32 s5, s4, s6 ; GCN-NEXT: s_cmp_ge_u32 s4, s6 ; GCN-NEXT: s_cselect_b32 s4, s5, s4 ; GCN-NEXT: s_sub_i32 s5, s4, s6 ; GCN-NEXT: s_cmp_ge_u32 s4, s6 ; GCN-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem23_64_v2i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s6, s13, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GCN-IR-NEXT: s_lshr_b32 s0, s15, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s0 -; GCN-IR-NEXT: s_lshr_b32 s7, s11, 9 +; GCN-IR-NEXT: s_lshr_b32 s4, s15, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GCN-IR-NEXT: s_lshr_b32 s5, s11, 9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-IR-NEXT: s_sub_i32 s1, 0, s6 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GCN-IR-NEXT: s_sub_i32 s8, 0, s6 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 -; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-IR-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GCN-IR-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 -; GCN-IR-NEXT: s_lshr_b32 s8, s9, 1 -; GCN-IR-NEXT: v_mul_hi_u32 v5, v0, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s8, v0 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v2, -v3, v1, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v4, s8, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 +; GCN-IR-NEXT: s_lshr_b32 s7, s9, 1 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, s4 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 +; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffff, v1 ; GCN-IR-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-IR-NEXT: s_mul_i32 s4, s4, s6 -; GCN-IR-NEXT: s_sub_i32 s4, s8, s4 +; GCN-IR-NEXT: s_sub_i32 s4, s7, s4 ; GCN-IR-NEXT: s_sub_i32 s5, s4, s6 ; GCN-IR-NEXT: s_cmp_ge_u32 s4, s6 ; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 ; GCN-IR-NEXT: s_sub_i32 s5, s4, s6 ; GCN-IR-NEXT: s_cmp_ge_u32 s4, s6 ; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, @@ -1293,8 +1291,8 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index b314cf2e1d9c..53525c93f5b8 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -1545,9 +1545,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll index 0f368ffd33b9..d254b7effbfc 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -299,9 +299,9 @@ define amdgpu_kernel void @multi_vcond_loop(ptr addrspace(1) noalias nocapture % ; SI-LABEL: multi_vcond_loop: ; SI: ; %bb.0: ; %bb ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf -; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_mov_b32_e32 v7, 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[6:7], s[8:11], 0 addr64