From 6c6fb00c942ef1ca571bb376969c37f1e6e2ea1f Mon Sep 17 00:00:00 2001 From: Iasonaskrpr <126663668+Iasonaskrpr@users.noreply.github.com> Date: Sat, 7 Feb 2026 11:10:12 +0200 Subject: [PATCH] [AMDGPU] Optimize S_OR_B32 to S_ADDK_I32 where possible (#177949) This PR fixes #177753, converting disjoint S_OR_B32 to S_ADDK_I32 whenever possible. It avoids this transformation in case the S_OR_B32 can be converted to a bitset instruction. Note on Test Failures (Draft Status) This change causes significant register reshuffling across the test suite due to the new allocation hints and the swaps performed in case src0 is not a register and src1 is, along with the change from S_OR to S_ADDK. To avoid a massive, noisy diff during the initial logic review: This Draft PR only includes a representative sample of updated tests. CodeGen/AMDGPU/combine-reg-or-const.ll -> Showcases change from S_OR to S_ADDK CodeGen/AMDGPU/s-barrier.ll -> Showcases swap between Src0 and Src1 if src0 is not a register The rest of the tests show the result of the register allocation hint we give; I have checked every test I updated and they seem ok to me. Once the core logic is approved, I will run the update script across the remaining ~70 failing tests and mark the PR as "Ready for Review." 
--- .../Target/AMDGPU/SIShrinkInstructions.cpp | 31 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 66 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 50 +- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 1168 ++++++++--------- .../CodeGen/AMDGPU/combine-reg-or-const.ll | 2 +- llvm/test/CodeGen/AMDGPU/min.ll | 34 +- .../CodeGen/AMDGPU/s_or_b32_transformation.ll | 31 + 7 files changed, 701 insertions(+), 681 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/s_or_b32_transformation.ll diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 5b32bd0b72a5..14ed778f44f3 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -909,9 +909,21 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { } } + // Shrink scalar logic operations. + if (MI.getOpcode() == AMDGPU::S_AND_B32 || + MI.getOpcode() == AMDGPU::S_OR_B32 || + MI.getOpcode() == AMDGPU::S_XOR_B32) { + ChangeKind CK = shrinkScalarLogicOp(MI); + if (CK == ChangeKind::UpdateHint) + continue; + Changed |= (CK == ChangeKind::UpdateInst); + } + // Try to use S_ADDK_I32 and S_MULK_I32. if (MI.getOpcode() == AMDGPU::S_ADD_I32 || - MI.getOpcode() == AMDGPU::S_MUL_I32) { + MI.getOpcode() == AMDGPU::S_MUL_I32 || + (MI.getOpcode() == AMDGPU::S_OR_B32 && + MI.getFlag(MachineInstr::MIFlag::Disjoint))) { const MachineOperand *Dest = &MI.getOperand(0); MachineOperand *Src0 = &MI.getOperand(1); MachineOperand *Src1 = &MI.getOperand(2); @@ -931,12 +943,11 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); continue; } - if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { if (Src1->isImm() && isKImmOperand(*Src1)) { - unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? - AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; - + unsigned Opc = (MI.getOpcode() == AMDGPU::S_MUL_I32) + ? 
AMDGPU::S_MULK_I32 + : AMDGPU::S_ADDK_I32; Src1->setImm(SignExtend64(Src1->getImm(), 32)); MI.setDesc(TII->get(Opc)); MI.tieOperands(0, 1); @@ -974,16 +985,6 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { continue; } - // Shrink scalar logic operations. - if (MI.getOpcode() == AMDGPU::S_AND_B32 || - MI.getOpcode() == AMDGPU::S_OR_B32 || - MI.getOpcode() == AMDGPU::S_XOR_B32) { - ChangeKind CK = shrinkScalarLogicOp(MI); - if (CK == ChangeKind::UpdateHint) - continue; - Changed |= (CK == ChangeKind::UpdateInst); - } - if (IsPostRA && TII->isMIMG(MI.getOpcode()) && ST->getGeneration() >= AMDGPUSubtarget::GFX10) { Changed |= shrinkMIMG(MI); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 6ad73601859d..4fefef5e6615 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -664,8 +664,8 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX6-NEXT: s_lshr_b32 s1, s1, 1 ; GFX6-NEXT: s_lshl_b32 s2, s3, s2 ; GFX6-NEXT: s_lshr_b32 s1, s1, s4 -; GFX6-NEXT: s_or_b32 s1, s2, s1 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_or_b32 s2, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xff ; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -954,17 +954,17 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX6-NEXT: s_and_b32 s4, s8, 7 ; GFX6-NEXT: s_andn2_b32 s6, 7, s8 ; GFX6-NEXT: s_lshr_b32 s1, s1, 25 -; GFX6-NEXT: s_and_b32 s2, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s4, s5, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, s6 +; GFX6-NEXT: s_or_b32 s4, s4, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xff ; GFX6-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_or_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s3, 0xff -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: 
s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s4, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -3689,8 +3689,8 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; GFX8-NEXT: s_lshr_b32 s1, s1, 17 ; GFX8-NEXT: s_lshl_b32 s2, s3, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, s4 -; GFX8-NEXT: s_or_b32 s1, s2, s1 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_or_b32 s2, s2, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -4268,15 +4268,15 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, < ; GFX8-NEXT: s_lshr_b32 s2, s2, 17 ; GFX8-NEXT: s_lshl_b32 s4, s6, s4 ; GFX8-NEXT: s_lshr_b32 s2, s2, s7 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s4, s5, 15 +; GFX8-NEXT: s_or_b32 s4, s4, s2 +; GFX8-NEXT: s_and_b32 s2, s5, 15 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_lshr_b32 s3, s3, s5 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_or_b32 s1, s1, s3 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_lshr_b32 s2, s2, s5 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 @@ -4614,29 +4614,29 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; GFX8-NEXT: s_lshr_b32 s2, s2, 17 ; GFX8-NEXT: s_lshl_b32 s4, s6, s4 ; GFX8-NEXT: s_lshr_b32 s2, s2, s8 -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: 
s_and_b32 s4, s5, 15 +; GFX8-NEXT: s_or_b32 s4, s4, s2 +; GFX8-NEXT: s_and_b32 s2, s5, 15 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s4, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s9, s5, 16 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_lshr_b32 s4, s4, 1 -; GFX8-NEXT: s_lshr_b32 s4, s4, s5 -; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s4, s9, 15 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_lshr_b32 s2, s2, s5 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s9, 15 ; GFX8-NEXT: s_andn2_b32 s5, 15, s9 ; GFX8-NEXT: s_lshr_b32 s3, s3, 17 -; GFX8-NEXT: s_lshl_b32 s4, s7, s4 +; GFX8-NEXT: s_lshl_b32 s2, s7, s2 ; GFX8-NEXT: s_lshr_b32 s3, s3, s5 +; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 5afab53628c3..bc6a2e7c4325 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -665,8 +665,8 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, s2 -; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_or_b32 s3, s3, s1 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: 
s_or_b32 s0, s0, s1 @@ -946,28 +946,28 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX6-NEXT: s_bfe_u32 s7, s1, 0x80008 ; GFX6-NEXT: s_lshr_b32 s2, s7, s2 ; GFX6-NEXT: s_lshr_b32 s6, s1, 24 -; GFX6-NEXT: s_or_b32 s2, s3, s2 -; GFX6-NEXT: s_and_b32 s3, s8, 7 +; GFX6-NEXT: s_or_b32 s3, s3, s2 +; GFX6-NEXT: s_and_b32 s2, s8, 7 ; GFX6-NEXT: s_andn2_b32 s7, 7, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX6-NEXT: s_lshl_b32 s4, s4, s7 -; GFX6-NEXT: s_lshr_b32 s1, s1, s3 -; GFX6-NEXT: s_or_b32 s1, s4, s1 -; GFX6-NEXT: s_and_b32 s3, s9, 7 -; GFX6-NEXT: s_andn2_b32 s4, 7, s9 +; GFX6-NEXT: s_lshr_b32 s1, s1, s2 +; GFX6-NEXT: s_or_b32 s4, s4, s1 +; GFX6-NEXT: s_and_b32 s1, s9, 7 +; GFX6-NEXT: s_andn2_b32 s2, 7, s9 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 -; GFX6-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NEXT: s_lshl_b32 s4, s5, s4 -; GFX6-NEXT: s_lshr_b32 s3, s6, s3 +; GFX6-NEXT: s_lshl_b32 s2, s5, s2 +; GFX6-NEXT: s_lshr_b32 s1, s6, s1 +; GFX6-NEXT: s_or_b32 s2, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: s_or_b32 s3, s4, s3 -; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s4, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, 0xff +; GFX6-NEXT: s_and_b32 s1, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -3443,8 +3443,8 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshl_b32 s2, s3, s2 ; GFX8-NEXT: s_lshr_b32 s1, s4, s1 -; GFX8-NEXT: s_or_b32 s1, s2, s1 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_or_b32 s2, s2, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX8-NEXT: s_and_b32 s0, 
0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -4026,15 +4026,15 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, < ; GFX8-NEXT: s_lshl_b32 s6, s6, 1 ; GFX8-NEXT: s_lshl_b32 s4, s6, s4 ; GFX8-NEXT: s_lshr_b32 s2, s7, s2 -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s4, s5, 15 +; GFX8-NEXT: s_or_b32 s4, s4, s2 +; GFX8-NEXT: s_and_b32 s2, s5, 15 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, s4 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_or_b32 s1, s1, s3 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 @@ -4376,8 +4376,8 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; GFX8-NEXT: s_lshl_b32 s6, s6, 1 ; GFX8-NEXT: s_lshl_b32 s4, s6, s4 ; GFX8-NEXT: s_lshr_b32 s2, s7, s2 -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_or_b32 s4, s4, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 6c09950689ef..5a06737d923f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -68693,7 +68693,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v40, s53, 13 ; SI-NEXT: v_writelane_b32 v40, s54, 14 ; SI-NEXT: v_writelane_b32 v40, s55, 15 -; SI-NEXT: s_mov_b32 s92, s16 ; SI-NEXT: v_writelane_b32 v40, s64, 16 ; SI-NEXT: v_writelane_b32 v40, s65, 17 ; SI-NEXT: v_writelane_b32 v40, s66, 18 @@ 
-68710,52 +68709,54 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: v_writelane_b32 v40, s86, 30 ; SI-NEXT: v_writelane_b32 v40, s87, 31 -; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v40, s96, 32 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v41, s23, 0 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v40, s97, 33 -; SI-NEXT: v_writelane_b32 v41, s21, 1 -; SI-NEXT: v_readfirstlane_b32 s47, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s28, 0 +; SI-NEXT: v_writelane_b32 v41, s26, 1 +; SI-NEXT: v_writelane_b32 v41, s23, 2 +; SI-NEXT: v_writelane_b32 v41, s22, 3 ; SI-NEXT: v_writelane_b32 v40, s98, 34 -; SI-NEXT: v_writelane_b32 v41, s47, 2 +; SI-NEXT: v_writelane_b32 v41, s21, 4 ; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: s_mov_b32 s88, s29 +; SI-NEXT: s_mov_b32 s30, s25 +; SI-NEXT: s_mov_b32 s29, s24 +; SI-NEXT: v_writelane_b32 v41, s20, 5 ; SI-NEXT: v_readfirstlane_b32 s82, v30 -; SI-NEXT: v_readfirstlane_b32 s83, v28 -; SI-NEXT: v_readfirstlane_b32 s44, v27 -; SI-NEXT: v_readfirstlane_b32 s96, v26 -; SI-NEXT: v_readfirstlane_b32 s70, v25 -; SI-NEXT: v_readfirstlane_b32 s68, v24 -; SI-NEXT: v_readfirstlane_b32 s84, v23 +; SI-NEXT: v_readfirstlane_b32 s57, v29 +; SI-NEXT: v_readfirstlane_b32 s34, v28 +; SI-NEXT: v_readfirstlane_b32 s83, v27 +; SI-NEXT: v_readfirstlane_b32 s46, v26 +; SI-NEXT: v_readfirstlane_b32 s68, v25 +; SI-NEXT: v_readfirstlane_b32 s52, v24 +; SI-NEXT: v_readfirstlane_b32 s81, v23 ; SI-NEXT: v_readfirstlane_b32 s65, v22 ; SI-NEXT: v_readfirstlane_b32 s86, v21 -; SI-NEXT: v_readfirstlane_b32 s66, v20 +; SI-NEXT: v_readfirstlane_b32 s84, v20 ; SI-NEXT: v_readfirstlane_b32 s87, v19 ; SI-NEXT: v_readfirstlane_b32 s80, v18 ; SI-NEXT: v_readfirstlane_b32 s36, v17 -; SI-NEXT: v_readfirstlane_b32 s31, v16 +; SI-NEXT: v_readfirstlane_b32 s97, 
v16 ; SI-NEXT: v_readfirstlane_b32 s64, v15 ; SI-NEXT: v_readfirstlane_b32 s38, v14 ; SI-NEXT: v_readfirstlane_b32 s67, v13 -; SI-NEXT: v_readfirstlane_b32 s34, v12 -; SI-NEXT: v_readfirstlane_b32 s71, v11 -; SI-NEXT: v_readfirstlane_b32 s81, v10 +; SI-NEXT: v_readfirstlane_b32 s48, v12 +; SI-NEXT: v_readfirstlane_b32 s70, v11 +; SI-NEXT: v_readfirstlane_b32 s71, v10 ; SI-NEXT: v_readfirstlane_b32 s37, v9 -; SI-NEXT: v_readfirstlane_b32 s35, v8 -; SI-NEXT: v_readfirstlane_b32 s49, v7 +; SI-NEXT: v_readfirstlane_b32 s28, v8 +; SI-NEXT: v_readfirstlane_b32 s93, v7 ; SI-NEXT: v_readfirstlane_b32 s94, v6 -; SI-NEXT: v_readfirstlane_b32 s51, v5 -; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: v_readfirstlane_b32 s53, v3 -; SI-NEXT: v_readfirstlane_b32 s54, v2 -; SI-NEXT: v_readfirstlane_b32 s89, v1 -; SI-NEXT: v_readfirstlane_b32 s90, v0 +; SI-NEXT: v_readfirstlane_b32 s49, v5 +; SI-NEXT: v_readfirstlane_b32 s95, v4 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s91, v31 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s16, v32 +; SI-NEXT: v_readfirstlane_b32 s89, v32 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s93, v33 +; SI-NEXT: v_readfirstlane_b32 s79, v33 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 @@ -68765,245 +68766,248 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s52, v34 +; SI-NEXT: v_readfirstlane_b32 s39, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s55, v35 +; SI-NEXT: v_readfirstlane_b32 s66, v35 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s79, v37 +; SI-NEXT: v_readfirstlane_b32 s96, v37 ; SI-NEXT: 
buffer_load_dword v37, off, s[0:3], s32 offset:76 -; SI-NEXT: v_readfirstlane_b32 s50, v36 +; SI-NEXT: v_readfirstlane_b32 s55, v36 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s21, v38 +; SI-NEXT: v_readfirstlane_b32 s47, v38 +; SI-NEXT: v_readfirstlane_b32 s53, v3 +; SI-NEXT: v_readfirstlane_b32 s92, v2 +; SI-NEXT: v_readfirstlane_b32 s20, v1 +; SI-NEXT: v_readfirstlane_b32 s54, v0 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s56, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s85, v32 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s58, v33 +; SI-NEXT: v_readfirstlane_b32 s21, v33 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s98, v39 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: v_readfirstlane_b32 s26, v48 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s99, v49 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s97, v50 +; SI-NEXT: v_readfirstlane_b32 s24, v50 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s9, v51 -; SI-NEXT: v_writelane_b32 v41, s58, 3 -; SI-NEXT: v_writelane_b32 v41, s9, 4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; SI-NEXT: v_readfirstlane_b32 s23, v51 +; SI-NEXT: v_writelane_b32 v41, s21, 6 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: v_readfirstlane_b32 s22, v34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s69, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v36 +; SI-NEXT: v_readfirstlane_b32 s25, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB99_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: 
s_and_b32 s4, s92, 0xff +; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s19, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s24, 0xff -; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_and_b32 s5, s29, 0xff +; SI-NEXT: s_lshl_b32 s6, s30, 8 ; SI-NEXT: s_or_b32 s13, s5, s6 -; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_readlane_b32 s5, v41, 1 +; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s27, 24 ; SI-NEXT: s_or_b32 s6, s6, s5 -; SI-NEXT: s_and_b32 s5, s54, 0xff +; SI-NEXT: s_and_b32 s5, s92, 0xff ; SI-NEXT: s_lshl_b32 s7, s53, 8 ; SI-NEXT: s_or_b32 s14, s5, s7 -; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: s_and_b32 s5, s95, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s51, 24 +; SI-NEXT: s_lshl_b32 s7, s49, 24 ; SI-NEXT: s_or_b32 s8, s7, s5 -; SI-NEXT: s_and_b32 s5, s81, 0xff -; SI-NEXT: s_lshl_b32 s7, s71, 8 +; SI-NEXT: s_and_b32 s5, s71, 0xff +; SI-NEXT: s_lshl_b32 s7, s70, 8 ; SI-NEXT: s_or_b32 s15, s5, s7 -; SI-NEXT: s_and_b32 s5, s34, 0xff +; SI-NEXT: s_and_b32 s5, s48, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s7, s67, 24 ; SI-NEXT: s_or_b32 s10, s7, s5 ; SI-NEXT: s_and_b32 s5, s80, 0xff ; SI-NEXT: s_lshl_b32 s7, s87, 8 ; SI-NEXT: s_or_b32 s40, s5, s7 -; SI-NEXT: s_and_b32 s5, s66, 0xff +; SI-NEXT: s_and_b32 s5, s84, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s7, s86, 24 ; SI-NEXT: s_or_b32 s60, s7, s5 -; SI-NEXT: s_and_b32 s5, s96, 0xff -; SI-NEXT: s_lshl_b32 s7, s44, 8 +; SI-NEXT: s_and_b32 s5, s46, 0xff +; SI-NEXT: s_lshl_b32 s7, s83, 8 ; SI-NEXT: s_or_b32 s41, s5, s7 -; SI-NEXT: s_and_b32 s5, s9, 0xff -; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_and_b32 s5, s23, 0xff +; SI-NEXT: s_lshl_b32 s7, s24, 8 ; SI-NEXT: s_or_b32 s42, s5, s7 -; SI-NEXT: s_and_b32 s5, s21, 0xff -; SI-NEXT: s_lshl_b32 s7, 
s79, 8 +; SI-NEXT: s_and_b32 s5, s47, 0xff +; SI-NEXT: s_lshl_b32 s7, s96, 8 ; SI-NEXT: s_or_b32 s43, s5, s7 -; SI-NEXT: v_readlane_b32 s7, v41, 1 -; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: v_readlane_b32 s5, v41, 5 +; SI-NEXT: v_readlane_b32 s7, v41, 4 +; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s5, s5, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: v_readlane_b32 s9, v41, 0 +; SI-NEXT: v_readlane_b32 s7, v41, 3 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: v_readlane_b32 s9, v41, 2 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s9, s9, 24 -; SI-NEXT: s_or_b32 s57, s9, s7 -; SI-NEXT: s_and_b32 s7, s28, 0xff -; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: v_writelane_b32 v41, s46, 9 +; SI-NEXT: s_mov_b32 s56, s30 +; SI-NEXT: s_or_b32 s30, s9, s7 +; SI-NEXT: v_readlane_b32 s7, v41, 0 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_lshl_b32 s9, s88, 8 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s90, 0xff +; SI-NEXT: s_and_b32 s9, s54, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s89, 24 -; SI-NEXT: s_or_b32 s77, s11, s9 +; SI-NEXT: s_lshl_b32 s11, s20, 24 +; SI-NEXT: s_or_b32 s90, s11, s9 ; SI-NEXT: s_and_b32 s9, s94, 0xff -; SI-NEXT: s_lshl_b32 s11, s49, 8 +; SI-NEXT: s_lshl_b32 s11, s93, 8 ; SI-NEXT: s_or_b32 s9, s9, s11 -; SI-NEXT: s_and_b32 s11, s35, 0xff +; SI-NEXT: s_and_b32 s11, s28, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: v_writelane_b32 v41, s44, 11 ; SI-NEXT: s_lshl_b32 s44, s37, 24 ; SI-NEXT: s_or_b32 vcc_lo, s44, s11 ; SI-NEXT: s_and_b32 s11, s38, 0xff ; SI-NEXT: s_lshl_b32 s44, s64, 8 ; SI-NEXT: s_or_b32 s11, s11, s44 -; SI-NEXT: s_and_b32 s44, s31, 0xff +; SI-NEXT: s_and_b32 s44, s97, 0xff ; SI-NEXT: s_lshl_b32 s44, s44, 16 ; SI-NEXT: s_lshl_b32 s45, s36, 24 +; SI-NEXT: v_writelane_b32 v41, s23, 10 ; SI-NEXT: s_or_b32 vcc_hi, s45, s44 ; SI-NEXT: s_and_b32 s44, s65, 0xff -; SI-NEXT: s_lshl_b32 s45, s84, 8 +; SI-NEXT: s_lshl_b32 s45, s81, 8 ; SI-NEXT: 
s_or_b32 s44, s44, s45 -; SI-NEXT: s_and_b32 s45, s68, 0xff +; SI-NEXT: s_and_b32 s45, s52, 0xff +; SI-NEXT: v_writelane_b32 v41, s92, 11 ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_mov_b32 s23, s21 -; SI-NEXT: s_mov_b32 s21, s46 -; SI-NEXT: s_lshl_b32 s46, s70, 24 +; SI-NEXT: s_lshl_b32 s46, s68, 24 +; SI-NEXT: v_writelane_b32 v41, s93, 12 ; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_writelane_b32 v41, s97, 12 -; SI-NEXT: s_mov_b32 s97, s86 -; SI-NEXT: s_mov_b32 s86, s84 -; SI-NEXT: s_mov_b32 s84, s70 -; SI-NEXT: s_mov_b32 s70, s34 -; SI-NEXT: s_mov_b32 s34, s88 -; SI-NEXT: s_mov_b32 s88, s24 -; SI-NEXT: s_or_b32 s24, s46, s45 -; SI-NEXT: s_or_b32 s61, s44, s24 +; SI-NEXT: v_writelane_b32 v41, s20, 13 +; SI-NEXT: s_or_b32 s20, s46, s45 +; SI-NEXT: s_or_b32 s61, s44, s20 ; SI-NEXT: s_and_b32 s44, s82, 0xff -; SI-NEXT: s_lshl_b32 s45, s30, 8 +; SI-NEXT: s_lshl_b32 s45, s25, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_and_b32 s45, s69, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s78, 24 -; SI-NEXT: s_mov_b32 s95, s90 -; SI-NEXT: s_mov_b32 s90, s18 +; SI-NEXT: s_lshl_b32 s46, s22, 24 +; SI-NEXT: v_writelane_b32 v41, s96, 14 +; SI-NEXT: s_mov_b32 s96, s84 +; SI-NEXT: s_mov_b32 s84, s67 +; SI-NEXT: s_mov_b32 s67, s49 +; SI-NEXT: s_mov_b32 s49, s18 ; SI-NEXT: s_or_b32 s18, s46, s45 -; SI-NEXT: s_and_b32 s45, s83, 0xff +; SI-NEXT: s_and_b32 s45, s34, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s47, 24 +; SI-NEXT: s_lshl_b32 s46, s57, 24 ; SI-NEXT: s_and_b32 s44, s44, 0xffff ; SI-NEXT: s_or_b32 s62, s46, s45 ; SI-NEXT: s_or_b32 s63, s44, s18 ; SI-NEXT: s_and_b32 s44, s98, 0xff -; SI-NEXT: s_lshl_b32 s45, s58, 8 +; SI-NEXT: s_lshl_b32 s45, s21, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_and_b32 s45, s85, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s56, 24 -; SI-NEXT: s_mov_b32 s76, s56 -; SI-NEXT: s_mov_b32 s56, s85 -; SI-NEXT: s_mov_b32 s85, s79 -; SI-NEXT: s_mov_b32 s79, s19 +; 
SI-NEXT: s_lshl_b32 s46, s59, 24 +; SI-NEXT: s_mov_b32 s23, s88 +; SI-NEXT: s_mov_b32 s88, s19 ; SI-NEXT: s_or_b32 s19, s46, s45 ; SI-NEXT: s_and_b32 s45, s99, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s21, 24 +; SI-NEXT: s_lshl_b32 s46, s26, 24 ; SI-NEXT: s_and_b32 s44, s44, 0xffff ; SI-NEXT: s_or_b32 s72, s46, s45 ; SI-NEXT: s_or_b32 s73, s44, s19 -; SI-NEXT: s_and_b32 s44, s52, 0xff -; SI-NEXT: s_lshl_b32 s45, s93, 8 +; SI-NEXT: s_and_b32 s44, s39, 0xff +; SI-NEXT: s_lshl_b32 s45, s79, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: s_and_b32 s45, s16, 0xff +; SI-NEXT: s_and_b32 s45, s89, 0xff +; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_lshl_b32 s45, s45, 16 ; SI-NEXT: s_lshl_b32 s46, s91, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_mov_b32 s47, s96 -; SI-NEXT: s_mov_b32 s96, s78 -; SI-NEXT: s_mov_b32 s78, s69 -; SI-NEXT: s_mov_b32 s69, s68 -; SI-NEXT: s_mov_b32 s68, s38 -; SI-NEXT: s_mov_b32 s38, s35 -; SI-NEXT: s_mov_b32 s35, s89 -; SI-NEXT: s_or_b32 s89, s46, s45 -; SI-NEXT: s_and_b32 s45, s50, 0xff -; SI-NEXT: s_or_b32 s5, s5, s57 -; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s55, 24 +; SI-NEXT: s_or_b32 s5, s5, s30 +; SI-NEXT: s_mov_b32 s77, s47 +; SI-NEXT: s_mov_b32 s47, s24 +; SI-NEXT: s_mov_b32 s24, s83 +; SI-NEXT: s_mov_b32 s83, s70 +; SI-NEXT: s_mov_b32 s70, s38 +; SI-NEXT: s_mov_b32 s38, s91 +; SI-NEXT: s_mov_b32 s91, s23 +; SI-NEXT: s_mov_b32 s23, s17 +; SI-NEXT: s_or_b32 s17, s46, s45 +; SI-NEXT: s_and_b32 s45, s55, 0xff ; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_or_b32 s74, s46, s45 -; SI-NEXT: s_mov_b32 s45, s83 -; SI-NEXT: s_mov_b32 s83, s91 -; SI-NEXT: s_mov_b32 s91, s28 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_mov_b32 s76, s59 +; SI-NEXT: s_mov_b32 s59, s57 +; SI-NEXT: s_mov_b32 s57, s34 +; SI-NEXT: s_mov_b32 s34, s82 +; SI-NEXT: s_mov_b32 s82, s52 +; SI-NEXT: s_mov_b32 s52, s95 +; SI-NEXT: 
s_mov_b32 s95, s27 +; SI-NEXT: s_mov_b32 s21, s22 +; SI-NEXT: s_mov_b32 s22, s69 +; SI-NEXT: s_mov_b32 s69, s48 +; SI-NEXT: s_mov_b32 s48, s37 +; SI-NEXT: s_mov_b32 s37, s56 +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s66, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_mov_b32 s56, s98 +; SI-NEXT: s_mov_b32 s98, s28 ; SI-NEXT: s_and_b32 s28, s42, 0xffff -; SI-NEXT: s_mov_b32 s59, s94 -; SI-NEXT: s_mov_b32 s94, s27 ; SI-NEXT: s_and_b32 s27, s43, 0xffff ; SI-NEXT: s_or_b32 s42, s12, s4 ; SI-NEXT: s_mov_b32 s43, s5 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: s_or_b32 s7, s7, s90 ; SI-NEXT: s_or_b32 s9, s9, vcc_lo -; SI-NEXT: v_writelane_b32 v41, s4, 5 -; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: v_writelane_b32 v41, s5, 6 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 16 ; SI-NEXT: s_or_b32 s11, s11, vcc_hi -; SI-NEXT: v_writelane_b32 v41, s4, 7 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_writelane_b32 v41, s5, 8 -; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 16 -; SI-NEXT: s_or_b32 s7, s7, s77 -; SI-NEXT: s_or_b32 s75, s44, s89 +; SI-NEXT: s_or_b32 s74, s46, s45 +; SI-NEXT: s_or_b32 s75, s44, s17 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_and_b32 s58, s15, 0xffff -; SI-NEXT: s_mov_b32 s44, s82 -; SI-NEXT: s_mov_b32 s82, s81 -; SI-NEXT: s_mov_b32 s81, s55 -; SI-NEXT: s_mov_b32 s55, s54 -; SI-NEXT: s_mov_b32 s54, s51 -; SI-NEXT: s_mov_b32 s51, s37 -; SI-NEXT: s_mov_b32 s37, s16 +; SI-NEXT: s_mov_b32 s92, s16 ; SI-NEXT: s_and_b32 s16, s40, 0xffff -; SI-NEXT: s_mov_b32 s46, s98 -; SI-NEXT: s_mov_b32 s98, s93 -; SI-NEXT: s_and_b32 s93, s41, 0xffff -; SI-NEXT: v_writelane_b32 v41, s4, 9 -; SI-NEXT: s_mov_b32 s39, s49 +; SI-NEXT: s_mov_b32 s31, s29 +; SI-NEXT: s_and_b32 s29, s41, 0xffff +; SI-NEXT: v_writelane_b32 v41, s4, 7 +; SI-NEXT: s_mov_b32 s93, s39 +; SI-NEXT: s_mov_b32 s39, s79 +; SI-NEXT: v_writelane_b32 v41, s5, 8 ; SI-NEXT: s_or_b32 s40, s13, 
s6 ; SI-NEXT: s_mov_b32 s41, s7 -; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 16 ; SI-NEXT: s_or_b32 s14, s14, s8 ; SI-NEXT: s_mov_b32 s15, s9 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 16 ; SI-NEXT: s_or_b32 s12, s58, s10 ; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: v_writelane_b32 v41, s5, 10 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 ; SI-NEXT: s_or_b32 s10, s16, s60 ; SI-NEXT: s_mov_b32 s11, s61 ; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 -; SI-NEXT: s_or_b32 s8, s93, s62 +; SI-NEXT: s_or_b32 s8, s29, s62 ; SI-NEXT: s_mov_b32 s9, s63 ; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 ; SI-NEXT: s_or_b32 s6, s28, s72 @@ -69012,68 +69016,68 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s4, s27, s74 ; SI-NEXT: s_mov_b32 s5, s75 ; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 -; SI-NEXT: s_mov_b32 s16, s37 -; SI-NEXT: s_mov_b32 s37, s51 -; SI-NEXT: s_mov_b32 s51, s54 -; SI-NEXT: s_mov_b32 s54, s55 -; SI-NEXT: s_mov_b32 s55, s81 -; SI-NEXT: s_mov_b32 s81, s82 -; SI-NEXT: s_mov_b32 s82, s44 -; SI-NEXT: v_readlane_b32 s44, v41, 11 -; SI-NEXT: s_mov_b32 s93, s98 -; SI-NEXT: s_mov_b32 s98, s46 -; SI-NEXT: s_mov_b32 s46, s21 -; SI-NEXT: s_mov_b32 s21, s23 -; SI-NEXT: s_mov_b32 s28, s91 -; SI-NEXT: s_mov_b32 s91, s83 -; SI-NEXT: s_mov_b32 s83, s45 -; SI-NEXT: s_mov_b32 s27, s94 -; SI-NEXT: s_mov_b32 s94, s59 -; SI-NEXT: s_lshr_b32 s23, s57, 16 -; SI-NEXT: s_lshr_b32 s57, s77, 16 -; SI-NEXT: s_lshr_b32 s59, vcc_lo, 16 +; SI-NEXT: s_mov_b32 s16, s92 +; SI-NEXT: s_mov_b32 s79, s39 +; SI-NEXT: s_mov_b32 s39, s93 +; SI-NEXT: s_mov_b32 s29, s31 +; SI-NEXT: s_mov_b32 s28, s98 +; SI-NEXT: s_mov_b32 s98, s56 +; SI-NEXT: s_lshr_b32 s35, s30, 16 +; SI-NEXT: v_readlane_b32 s46, v41, 9 +; SI-NEXT: s_lshr_b32 s56, s90, 16 +; SI-NEXT: s_lshr_b32 s58, vcc_lo, 16 ; SI-NEXT: s_lshr_b32 s61, vcc_hi, 16 -; SI-NEXT: s_lshr_b32 s63, s24, 16 -; SI-NEXT: s_mov_b32 s24, s88 -; SI-NEXT: s_mov_b32 s88, s34 -; 
SI-NEXT: s_mov_b32 s34, s70 -; SI-NEXT: s_mov_b32 s70, s84 -; SI-NEXT: s_mov_b32 s84, s86 -; SI-NEXT: s_mov_b32 s86, s97 -; SI-NEXT: v_readlane_b32 s97, v41, 12 +; SI-NEXT: s_lshr_b32 s63, s20, 16 +; SI-NEXT: v_readlane_b32 s20, v41, 13 +; SI-NEXT: v_readlane_b32 s93, v41, 12 +; SI-NEXT: v_readlane_b32 s92, v41, 11 ; SI-NEXT: s_lshr_b32 s73, s18, 16 -; SI-NEXT: s_mov_b32 s18, s90 -; SI-NEXT: s_mov_b32 s90, s95 -; SI-NEXT: s_mov_b32 s49, s39 +; SI-NEXT: s_mov_b32 s18, s49 +; SI-NEXT: s_mov_b32 s49, s67 +; SI-NEXT: s_mov_b32 s67, s84 +; SI-NEXT: s_mov_b32 s84, s96 +; SI-NEXT: v_readlane_b32 s96, v41, 14 ; SI-NEXT: s_lshr_b32 s75, s19, 16 -; SI-NEXT: s_mov_b32 s19, s79 -; SI-NEXT: s_mov_b32 s79, s85 -; SI-NEXT: s_mov_b32 s85, s56 -; SI-NEXT: s_mov_b32 s56, s76 -; SI-NEXT: s_lshr_b32 s45, s89, 16 -; SI-NEXT: s_mov_b32 s89, s35 -; SI-NEXT: s_mov_b32 s35, s38 -; SI-NEXT: s_mov_b32 s38, s68 -; SI-NEXT: s_mov_b32 s68, s69 -; SI-NEXT: s_mov_b32 s69, s78 -; SI-NEXT: s_mov_b32 s78, s96 -; SI-NEXT: s_mov_b32 s96, s47 +; SI-NEXT: s_mov_b32 s19, s88 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_mov_b32 s17, s23 +; SI-NEXT: v_readlane_b32 s23, v41, 10 +; SI-NEXT: s_mov_b32 s30, s37 +; SI-NEXT: s_mov_b32 s37, s48 +; SI-NEXT: s_mov_b32 s48, s69 +; SI-NEXT: s_mov_b32 s69, s22 +; SI-NEXT: s_mov_b32 s22, s21 +; SI-NEXT: s_mov_b32 s27, s95 +; SI-NEXT: s_mov_b32 s95, s52 +; SI-NEXT: s_mov_b32 s52, s82 +; SI-NEXT: s_mov_b32 s82, s34 +; SI-NEXT: s_mov_b32 s88, s91 +; SI-NEXT: s_mov_b32 s91, s38 +; SI-NEXT: s_mov_b32 s38, s70 +; SI-NEXT: s_mov_b32 s70, s83 +; SI-NEXT: s_mov_b32 s83, s24 +; SI-NEXT: s_mov_b32 s24, s47 +; SI-NEXT: s_mov_b32 s47, s77 +; SI-NEXT: s_mov_b32 s34, s57 +; SI-NEXT: s_mov_b32 s57, s59 +; SI-NEXT: s_mov_b32 s59, s76 ; SI-NEXT: s_mov_b64 s[76:77], 0 ; SI-NEXT: s_branch .LBB99_3 ; SI-NEXT: .LBB99_2: ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_mov_b64 s[76:77], -1 -; SI-NEXT: v_writelane_b32 v41, s4, 5 -; SI-NEXT: v_writelane_b32 v41, s5, 6 -; SI-NEXT: ; 
implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 7 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: v_writelane_b32 v41, s5, 8 +; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr60 @@ -69084,39 +69088,31 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: v_writelane_b32 v41, s4, 7 -; SI-NEXT: v_writelane_b32 v41, s5, 8 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 9 -; SI-NEXT: v_writelane_b32 v41, s5, 10 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: .LBB99_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[76:77] -; SI-NEXT: v_readlane_b32 s76, v41, 5 -; SI-NEXT: v_readlane_b32 s77, v41, 6 -; SI-NEXT: s_mov_b32 s58, s76 ; SI-NEXT: v_readlane_b32 s76, v41, 7 ; SI-NEXT: v_readlane_b32 s77, v41, 8 ; SI-NEXT: s_cbranch_vccnz .LBB99_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s21, s47, 3 ; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s79, 8 -; SI-NEXT: s_add_i32 s50, s50, 3 +; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_add_i32 s50, s55, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s50, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s55, 24 +; SI-NEXT: s_lshl_b32 s5, s66, 24 ; SI-NEXT: s_lshl_b32 
s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s39, s52, 3 +; SI-NEXT: s_add_i32 s39, s39, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s39, 0xff -; SI-NEXT: s_lshl_b32 s6, s93, 8 -; SI-NEXT: s_add_i32 s79, s16, 3 +; SI-NEXT: s_lshl_b32 s6, s79, 8 +; SI-NEXT: s_add_i32 s79, s89, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s7, s79, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 @@ -69124,21 +69120,20 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s6, v41, 4 -; SI-NEXT: s_add_i32 s23, s6, 3 ; SI-NEXT: s_and_b32 s6, s23, 0xff -; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_lshl_b32 s7, s24, 8 ; SI-NEXT: s_add_i32 s99, s99, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s8, s99, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s46, 24 +; SI-NEXT: s_lshl_b32 s7, s26, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_add_i32 s98, s98, 3 -; SI-NEXT: v_readlane_b32 s8, v41, 3 +; SI-NEXT: v_readlane_b32 s8, v41, 6 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s98, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 8 @@ -69146,32 +69141,31 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s9, s85, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_lshl_b32 s8, s56, 24 +; SI-NEXT: s_lshl_b32 s8, s59, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_add_i32 s96, s96, 3 +; SI-NEXT: s_add_i32 s96, s46, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s96, 0xff -; SI-NEXT: s_lshl_b32 s9, s44, 8 -; SI-NEXT: s_add_i32 s83, s83, 3 +; SI-NEXT: s_lshl_b32 
s9, s83, 8 +; SI-NEXT: s_add_i32 s83, s34, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v41, 2 ; SI-NEXT: s_and_b32 s10, s83, 0xff ; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s9, s57, 24 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_add_i32 s82, s82, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_and_b32 s9, s82, 0xff -; SI-NEXT: s_lshl_b32 s10, s30, 8 +; SI-NEXT: s_lshl_b32 s10, s25, 8 ; SI-NEXT: s_add_i32 s69, s69, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s11, s69, 0xff ; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: s_lshl_b32 s10, s78, 24 +; SI-NEXT: s_lshl_b32 s10, s22, 24 ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s10, s10, s11 @@ -69179,7 +69173,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s80, 0xff ; SI-NEXT: s_lshl_b32 s11, s87, 8 -; SI-NEXT: s_add_i32 s66, s66, 3 +; SI-NEXT: s_add_i32 s66, s84, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_and_b32 s12, s66, 0xff ; SI-NEXT: s_addk_i32 s10, 0x300 @@ -69190,20 +69184,20 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s65, s65, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_and_b32 s11, s65, 0xff -; SI-NEXT: s_lshl_b32 s12, s84, 8 -; SI-NEXT: s_add_i32 s52, s68, 3 +; SI-NEXT: s_lshl_b32 s12, s81, 8 +; SI-NEXT: s_add_i32 s52, s52, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_and_b32 s13, s52, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 -; SI-NEXT: s_lshl_b32 s12, s70, 24 +; SI-NEXT: s_lshl_b32 s12, s68, 24 ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: s_add_i32 s55, s81, 3 +; SI-NEXT: s_add_i32 s55, s71, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_and_b32 s12, s55, 0xff -; SI-NEXT: s_lshl_b32 
s13, s71, 8 -; SI-NEXT: s_add_i32 s48, s34, 3 +; SI-NEXT: s_lshl_b32 s13, s70, 8 +; SI-NEXT: s_add_i32 s48, s48, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_and_b32 s14, s48, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 @@ -69215,7 +69209,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_and_b32 s13, s38, 0xff ; SI-NEXT: s_lshl_b32 s14, s64, 8 -; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: s_add_i32 s31, s97, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_and_b32 s15, s31, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 @@ -69223,84 +69217,85 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: s_add_i32 s36, s54, 3 +; SI-NEXT: s_add_i32 s36, s92, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_and_b32 s14, s36, 0xff ; SI-NEXT: s_lshl_b32 s15, s53, 8 -; SI-NEXT: s_add_i32 s95, s88, 3 +; SI-NEXT: s_add_i32 s95, s95, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s21, s95, 0xff ; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_lshl_b32 s15, s51, 24 +; SI-NEXT: s_lshl_b32 s15, s49, 24 ; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s15, s15, s21 ; SI-NEXT: s_add_i32 s94, s94, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s94, 0xff -; SI-NEXT: s_lshl_b32 s21, s49, 8 -; SI-NEXT: s_add_i32 s91, s35, 3 +; SI-NEXT: s_lshl_b32 s21, s93, 8 +; SI-NEXT: s_add_i32 s91, s28, 3 ; SI-NEXT: s_or_b32 s15, s21, s15 -; SI-NEXT: s_and_b32 s16, s91, 0xff +; SI-NEXT: s_and_b32 s22, s91, 0xff ; SI-NEXT: s_addk_i32 s15, 0x300 ; SI-NEXT: s_lshl_b32 s21, s37, 24 -; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_or_b32 s21, s21, s16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_add_i32 s24, s29, 3 +; 
SI-NEXT: v_readlane_b32 s23, v41, 1 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s15, s21, s15 ; SI-NEXT: s_and_b32 s21, s24, 0xff -; SI-NEXT: s_lshl_b32 s16, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s21, s16, s21 -; SI-NEXT: s_and_b32 s23, s26, 0xff -; SI-NEXT: s_addk_i32 s21, 0x300 -; SI-NEXT: s_lshl_b32 s16, s27, 24 -; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s23 -; SI-NEXT: s_or_b32 s21, s16, s21 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s40, s21, 0x3000000 -; SI-NEXT: s_and_b32 s21, s28, 0xff -; SI-NEXT: s_lshl_b32 s16, s29, 8 -; SI-NEXT: s_lshl_b32 s23, s89, 24 -; SI-NEXT: s_add_i32 s89, s90, 3 -; SI-NEXT: s_or_b32 s21, s16, s21 -; SI-NEXT: s_and_b32 s16, s89, 0xff -; SI-NEXT: s_addk_i32 s21, 0x300 -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s23, s16 -; SI-NEXT: s_or_b32 s16, s16, s21 -; SI-NEXT: s_add_i32 s41, s16, 0x3000000 -; SI-NEXT: s_add_i32 s16, s92, 3 +; SI-NEXT: s_lshl_b32 s22, s30, 8 +; SI-NEXT: s_add_i32 s26, s23, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_and_b32 s23, s26, 0xff ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s22, s27, 24 +; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s19, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s21, s22, s21 ; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s40, s21, 0x3000000 +; SI-NEXT: v_readlane_b32 s21, v41, 0 ; SI-NEXT: s_add_i32 s42, s16, 0x3000000 -; SI-NEXT: s_add_i32 s16, s20, 3 -; SI-NEXT: v_readlane_b32 s17, v41, 1 +; SI-NEXT: 
v_readlane_b32 s16, v41, 5 +; SI-NEXT: s_add_i32 s28, s21, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 4 +; SI-NEXT: v_readlane_b32 s18, v41, 3 +; SI-NEXT: s_and_b32 s21, s28, 0xff +; SI-NEXT: s_lshl_b32 s22, s88, 8 +; SI-NEXT: s_add_i32 s89, s54, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s22, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_and_b32 s22, s89, 0xff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v41, 0 +; SI-NEXT: v_readlane_b32 s17, v41, 2 ; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s20, s20, 24 +; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 @@ -69311,49 +69306,49 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s11, s11, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 0x3000000 -; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s14, s14, 0x3000000 ; SI-NEXT: s_add_i32 s15, s15, 0x3000000 -; SI-NEXT: s_lshr_b64 s[58:59], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_add_i32 s41, s20, 0x3000000 +; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[48:49], 
s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 9 -; SI-NEXT: s_lshr_b32 s23, s43, 16 -; SI-NEXT: s_lshr_b32 s57, s41, 16 -; SI-NEXT: s_lshr_b32 s59, s15, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s35, s43, 16 +; SI-NEXT: s_lshr_b32 s56, s41, 16 +; SI-NEXT: s_lshr_b32 s58, s15, 16 ; SI-NEXT: s_lshr_b32 s61, s13, 16 ; SI-NEXT: s_lshr_b32 s63, s11, 16 ; SI-NEXT: s_lshr_b32 s73, s9, 16 ; SI-NEXT: s_lshr_b32 s75, s7, 16 ; SI-NEXT: s_lshr_b32 s45, s5, 16 -; SI-NEXT: v_writelane_b32 v41, s17, 10 ; SI-NEXT: .LBB99_5: ; %end ; SI-NEXT: s_and_b32 s16, s42, 0xffff -; SI-NEXT: s_lshl_b32 s17, s58, 16 +; SI-NEXT: s_lshl_b32 s17, s76, 16 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s43, 0xffff -; SI-NEXT: s_lshl_b32 s18, s23, 16 +; SI-NEXT: s_lshl_b32 s18, s35, 16 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s40, 0xffff -; SI-NEXT: s_lshl_b32 s19, s48, 16 +; SI-NEXT: s_lshl_b32 s19, s78, 16 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s19, s41, 0xffff -; SI-NEXT: s_lshl_b32 s20, s57, 16 +; SI-NEXT: s_lshl_b32 s20, s56, 16 ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s20, s76, 16 +; SI-NEXT: s_lshl_b32 s20, s50, 16 ; SI-NEXT: s_or_b32 s14, s14, s20 ; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_lshl_b32 s20, s59, 16 +; SI-NEXT: s_lshl_b32 s20, s58, 16 ; SI-NEXT: s_or_b32 s15, s15, s20 -; SI-NEXT: v_readlane_b32 s20, v41, 9 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_lshl_b32 s20, s44, 16 ; SI-NEXT: s_or_b32 s12, s12, s20 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_lshl_b32 s20, s61, 16 @@ -69382,7 +69377,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_lshl_b32 s20, s45, 16 ; SI-NEXT: 
s_or_b32 s5, s5, s20 -; SI-NEXT: v_readlane_b32 s21, v41, 10 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 @@ -81308,7 +81302,6 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v40, s53, 13 ; SI-NEXT: v_writelane_b32 v40, s54, 14 ; SI-NEXT: v_writelane_b32 v40, s55, 15 -; SI-NEXT: s_mov_b32 s92, s16 ; SI-NEXT: v_writelane_b32 v40, s64, 16 ; SI-NEXT: v_writelane_b32 v40, s65, 17 ; SI-NEXT: v_writelane_b32 v40, s66, 18 @@ -81325,52 +81318,54 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: v_writelane_b32 v40, s86, 30 ; SI-NEXT: v_writelane_b32 v40, s87, 31 -; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v40, s96, 32 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v41, s23, 0 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v40, s97, 33 -; SI-NEXT: v_writelane_b32 v41, s21, 1 -; SI-NEXT: v_readfirstlane_b32 s47, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s28, 0 +; SI-NEXT: v_writelane_b32 v41, s26, 1 +; SI-NEXT: v_writelane_b32 v41, s23, 2 +; SI-NEXT: v_writelane_b32 v41, s22, 3 ; SI-NEXT: v_writelane_b32 v40, s98, 34 -; SI-NEXT: v_writelane_b32 v41, s47, 2 +; SI-NEXT: v_writelane_b32 v41, s21, 4 ; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: s_mov_b32 s88, s29 +; SI-NEXT: s_mov_b32 s30, s25 +; SI-NEXT: s_mov_b32 s29, s24 +; SI-NEXT: v_writelane_b32 v41, s20, 5 ; SI-NEXT: v_readfirstlane_b32 s82, v30 -; SI-NEXT: v_readfirstlane_b32 s83, v28 -; SI-NEXT: v_readfirstlane_b32 s44, v27 -; SI-NEXT: v_readfirstlane_b32 s96, v26 -; SI-NEXT: v_readfirstlane_b32 s70, v25 -; SI-NEXT: v_readfirstlane_b32 s68, v24 -; SI-NEXT: v_readfirstlane_b32 s84, v23 +; SI-NEXT: v_readfirstlane_b32 s57, v29 +; SI-NEXT: v_readfirstlane_b32 s34, v28 +; SI-NEXT: 
v_readfirstlane_b32 s83, v27 +; SI-NEXT: v_readfirstlane_b32 s46, v26 +; SI-NEXT: v_readfirstlane_b32 s68, v25 +; SI-NEXT: v_readfirstlane_b32 s52, v24 +; SI-NEXT: v_readfirstlane_b32 s81, v23 ; SI-NEXT: v_readfirstlane_b32 s65, v22 ; SI-NEXT: v_readfirstlane_b32 s86, v21 -; SI-NEXT: v_readfirstlane_b32 s66, v20 +; SI-NEXT: v_readfirstlane_b32 s84, v20 ; SI-NEXT: v_readfirstlane_b32 s87, v19 ; SI-NEXT: v_readfirstlane_b32 s80, v18 ; SI-NEXT: v_readfirstlane_b32 s36, v17 -; SI-NEXT: v_readfirstlane_b32 s31, v16 +; SI-NEXT: v_readfirstlane_b32 s97, v16 ; SI-NEXT: v_readfirstlane_b32 s64, v15 ; SI-NEXT: v_readfirstlane_b32 s38, v14 ; SI-NEXT: v_readfirstlane_b32 s67, v13 -; SI-NEXT: v_readfirstlane_b32 s34, v12 -; SI-NEXT: v_readfirstlane_b32 s71, v11 -; SI-NEXT: v_readfirstlane_b32 s81, v10 +; SI-NEXT: v_readfirstlane_b32 s48, v12 +; SI-NEXT: v_readfirstlane_b32 s70, v11 +; SI-NEXT: v_readfirstlane_b32 s71, v10 ; SI-NEXT: v_readfirstlane_b32 s37, v9 -; SI-NEXT: v_readfirstlane_b32 s35, v8 -; SI-NEXT: v_readfirstlane_b32 s49, v7 +; SI-NEXT: v_readfirstlane_b32 s28, v8 +; SI-NEXT: v_readfirstlane_b32 s93, v7 ; SI-NEXT: v_readfirstlane_b32 s94, v6 -; SI-NEXT: v_readfirstlane_b32 s51, v5 -; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: v_readfirstlane_b32 s53, v3 -; SI-NEXT: v_readfirstlane_b32 s54, v2 -; SI-NEXT: v_readfirstlane_b32 s89, v1 -; SI-NEXT: v_readfirstlane_b32 s90, v0 +; SI-NEXT: v_readfirstlane_b32 s49, v5 +; SI-NEXT: v_readfirstlane_b32 s95, v4 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s91, v31 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s16, v32 +; SI-NEXT: v_readfirstlane_b32 s89, v32 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s93, v33 +; SI-NEXT: v_readfirstlane_b32 s79, v33 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 @@ -81380,245 +81375,248 @@ define inreg 
<32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s52, v34 +; SI-NEXT: v_readfirstlane_b32 s39, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s55, v35 +; SI-NEXT: v_readfirstlane_b32 s66, v35 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s79, v37 +; SI-NEXT: v_readfirstlane_b32 s96, v37 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 -; SI-NEXT: v_readfirstlane_b32 s50, v36 +; SI-NEXT: v_readfirstlane_b32 s55, v36 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s21, v38 +; SI-NEXT: v_readfirstlane_b32 s47, v38 +; SI-NEXT: v_readfirstlane_b32 s53, v3 +; SI-NEXT: v_readfirstlane_b32 s92, v2 +; SI-NEXT: v_readfirstlane_b32 s20, v1 +; SI-NEXT: v_readfirstlane_b32 s54, v0 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s56, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s85, v32 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s58, v33 +; SI-NEXT: v_readfirstlane_b32 s21, v33 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s98, v39 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: v_readfirstlane_b32 s26, v48 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s99, v49 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s97, v50 +; SI-NEXT: v_readfirstlane_b32 s24, v50 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s9, v51 -; SI-NEXT: v_writelane_b32 v41, s58, 3 -; SI-NEXT: v_writelane_b32 v41, s9, 4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; 
SI-NEXT: v_readfirstlane_b32 s23, v51 +; SI-NEXT: v_writelane_b32 v41, s21, 6 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: v_readfirstlane_b32 s22, v34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s69, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v36 +; SI-NEXT: v_readfirstlane_b32 s25, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB107_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s19, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s24, 0xff -; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_and_b32 s5, s29, 0xff +; SI-NEXT: s_lshl_b32 s6, s30, 8 ; SI-NEXT: s_or_b32 s13, s5, s6 -; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_readlane_b32 s5, v41, 1 +; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s27, 24 ; SI-NEXT: s_or_b32 s6, s6, s5 -; SI-NEXT: s_and_b32 s5, s54, 0xff +; SI-NEXT: s_and_b32 s5, s92, 0xff ; SI-NEXT: s_lshl_b32 s7, s53, 8 ; SI-NEXT: s_or_b32 s14, s5, s7 -; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: s_and_b32 s5, s95, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s51, 24 +; SI-NEXT: s_lshl_b32 s7, s49, 24 ; SI-NEXT: s_or_b32 s8, s7, s5 -; SI-NEXT: s_and_b32 s5, s81, 0xff -; SI-NEXT: s_lshl_b32 s7, s71, 8 +; SI-NEXT: s_and_b32 s5, s71, 0xff +; SI-NEXT: s_lshl_b32 s7, s70, 8 ; SI-NEXT: s_or_b32 s15, s5, s7 -; SI-NEXT: s_and_b32 s5, s34, 0xff +; SI-NEXT: s_and_b32 s5, s48, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s7, s67, 24 ; SI-NEXT: s_or_b32 s10, s7, s5 ; SI-NEXT: s_and_b32 s5, s80, 0xff ; SI-NEXT: s_lshl_b32 s7, s87, 8 ; SI-NEXT: s_or_b32 s40, s5, s7 -; SI-NEXT: s_and_b32 s5, s66, 0xff +; 
SI-NEXT: s_and_b32 s5, s84, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s7, s86, 24 ; SI-NEXT: s_or_b32 s60, s7, s5 -; SI-NEXT: s_and_b32 s5, s96, 0xff -; SI-NEXT: s_lshl_b32 s7, s44, 8 +; SI-NEXT: s_and_b32 s5, s46, 0xff +; SI-NEXT: s_lshl_b32 s7, s83, 8 ; SI-NEXT: s_or_b32 s41, s5, s7 -; SI-NEXT: s_and_b32 s5, s9, 0xff -; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_and_b32 s5, s23, 0xff +; SI-NEXT: s_lshl_b32 s7, s24, 8 ; SI-NEXT: s_or_b32 s42, s5, s7 -; SI-NEXT: s_and_b32 s5, s21, 0xff -; SI-NEXT: s_lshl_b32 s7, s79, 8 +; SI-NEXT: s_and_b32 s5, s47, 0xff +; SI-NEXT: s_lshl_b32 s7, s96, 8 ; SI-NEXT: s_or_b32 s43, s5, s7 -; SI-NEXT: v_readlane_b32 s7, v41, 1 -; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: v_readlane_b32 s5, v41, 5 +; SI-NEXT: v_readlane_b32 s7, v41, 4 +; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s5, s5, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: v_readlane_b32 s9, v41, 0 +; SI-NEXT: v_readlane_b32 s7, v41, 3 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: v_readlane_b32 s9, v41, 2 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s9, s9, 24 -; SI-NEXT: s_or_b32 s57, s9, s7 -; SI-NEXT: s_and_b32 s7, s28, 0xff -; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: v_writelane_b32 v41, s46, 9 +; SI-NEXT: s_mov_b32 s56, s30 +; SI-NEXT: s_or_b32 s30, s9, s7 +; SI-NEXT: v_readlane_b32 s7, v41, 0 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_lshl_b32 s9, s88, 8 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s90, 0xff +; SI-NEXT: s_and_b32 s9, s54, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s89, 24 -; SI-NEXT: s_or_b32 s77, s11, s9 +; SI-NEXT: s_lshl_b32 s11, s20, 24 +; SI-NEXT: s_or_b32 s90, s11, s9 ; SI-NEXT: s_and_b32 s9, s94, 0xff -; SI-NEXT: s_lshl_b32 s11, s49, 8 +; SI-NEXT: s_lshl_b32 s11, s93, 8 ; SI-NEXT: s_or_b32 s9, s9, s11 -; SI-NEXT: s_and_b32 s11, s35, 0xff +; SI-NEXT: s_and_b32 s11, s28, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: 
v_writelane_b32 v41, s44, 11 ; SI-NEXT: s_lshl_b32 s44, s37, 24 ; SI-NEXT: s_or_b32 vcc_lo, s44, s11 ; SI-NEXT: s_and_b32 s11, s38, 0xff ; SI-NEXT: s_lshl_b32 s44, s64, 8 ; SI-NEXT: s_or_b32 s11, s11, s44 -; SI-NEXT: s_and_b32 s44, s31, 0xff +; SI-NEXT: s_and_b32 s44, s97, 0xff ; SI-NEXT: s_lshl_b32 s44, s44, 16 ; SI-NEXT: s_lshl_b32 s45, s36, 24 +; SI-NEXT: v_writelane_b32 v41, s23, 10 ; SI-NEXT: s_or_b32 vcc_hi, s45, s44 ; SI-NEXT: s_and_b32 s44, s65, 0xff -; SI-NEXT: s_lshl_b32 s45, s84, 8 +; SI-NEXT: s_lshl_b32 s45, s81, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: s_and_b32 s45, s68, 0xff +; SI-NEXT: s_and_b32 s45, s52, 0xff +; SI-NEXT: v_writelane_b32 v41, s92, 11 ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_mov_b32 s23, s21 -; SI-NEXT: s_mov_b32 s21, s46 -; SI-NEXT: s_lshl_b32 s46, s70, 24 +; SI-NEXT: s_lshl_b32 s46, s68, 24 +; SI-NEXT: v_writelane_b32 v41, s93, 12 ; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_writelane_b32 v41, s97, 12 -; SI-NEXT: s_mov_b32 s97, s86 -; SI-NEXT: s_mov_b32 s86, s84 -; SI-NEXT: s_mov_b32 s84, s70 -; SI-NEXT: s_mov_b32 s70, s34 -; SI-NEXT: s_mov_b32 s34, s88 -; SI-NEXT: s_mov_b32 s88, s24 -; SI-NEXT: s_or_b32 s24, s46, s45 -; SI-NEXT: s_or_b32 s61, s44, s24 +; SI-NEXT: v_writelane_b32 v41, s20, 13 +; SI-NEXT: s_or_b32 s20, s46, s45 +; SI-NEXT: s_or_b32 s61, s44, s20 ; SI-NEXT: s_and_b32 s44, s82, 0xff -; SI-NEXT: s_lshl_b32 s45, s30, 8 +; SI-NEXT: s_lshl_b32 s45, s25, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_and_b32 s45, s69, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s78, 24 -; SI-NEXT: s_mov_b32 s95, s90 -; SI-NEXT: s_mov_b32 s90, s18 +; SI-NEXT: s_lshl_b32 s46, s22, 24 +; SI-NEXT: v_writelane_b32 v41, s96, 14 +; SI-NEXT: s_mov_b32 s96, s84 +; SI-NEXT: s_mov_b32 s84, s67 +; SI-NEXT: s_mov_b32 s67, s49 +; SI-NEXT: s_mov_b32 s49, s18 ; SI-NEXT: s_or_b32 s18, s46, s45 -; SI-NEXT: s_and_b32 s45, s83, 0xff +; SI-NEXT: s_and_b32 s45, s34, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; 
SI-NEXT: s_lshl_b32 s46, s47, 24 +; SI-NEXT: s_lshl_b32 s46, s57, 24 ; SI-NEXT: s_and_b32 s44, s44, 0xffff ; SI-NEXT: s_or_b32 s62, s46, s45 ; SI-NEXT: s_or_b32 s63, s44, s18 ; SI-NEXT: s_and_b32 s44, s98, 0xff -; SI-NEXT: s_lshl_b32 s45, s58, 8 +; SI-NEXT: s_lshl_b32 s45, s21, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_and_b32 s45, s85, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s56, 24 -; SI-NEXT: s_mov_b32 s76, s56 -; SI-NEXT: s_mov_b32 s56, s85 -; SI-NEXT: s_mov_b32 s85, s79 -; SI-NEXT: s_mov_b32 s79, s19 +; SI-NEXT: s_lshl_b32 s46, s59, 24 +; SI-NEXT: s_mov_b32 s23, s88 +; SI-NEXT: s_mov_b32 s88, s19 ; SI-NEXT: s_or_b32 s19, s46, s45 ; SI-NEXT: s_and_b32 s45, s99, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s21, 24 +; SI-NEXT: s_lshl_b32 s46, s26, 24 ; SI-NEXT: s_and_b32 s44, s44, 0xffff ; SI-NEXT: s_or_b32 s72, s46, s45 ; SI-NEXT: s_or_b32 s73, s44, s19 -; SI-NEXT: s_and_b32 s44, s52, 0xff -; SI-NEXT: s_lshl_b32 s45, s93, 8 +; SI-NEXT: s_and_b32 s44, s39, 0xff +; SI-NEXT: s_lshl_b32 s45, s79, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: s_and_b32 s45, s16, 0xff +; SI-NEXT: s_and_b32 s45, s89, 0xff +; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_lshl_b32 s45, s45, 16 ; SI-NEXT: s_lshl_b32 s46, s91, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_mov_b32 s47, s96 -; SI-NEXT: s_mov_b32 s96, s78 -; SI-NEXT: s_mov_b32 s78, s69 -; SI-NEXT: s_mov_b32 s69, s68 -; SI-NEXT: s_mov_b32 s68, s38 -; SI-NEXT: s_mov_b32 s38, s35 -; SI-NEXT: s_mov_b32 s35, s89 -; SI-NEXT: s_or_b32 s89, s46, s45 -; SI-NEXT: s_and_b32 s45, s50, 0xff -; SI-NEXT: s_or_b32 s5, s5, s57 -; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s55, 24 +; SI-NEXT: s_or_b32 s5, s5, s30 +; SI-NEXT: s_mov_b32 s77, s47 +; SI-NEXT: s_mov_b32 s47, s24 +; SI-NEXT: s_mov_b32 s24, s83 +; SI-NEXT: s_mov_b32 s83, s70 +; SI-NEXT: s_mov_b32 s70, s38 +; SI-NEXT: s_mov_b32 s38, s91 +; SI-NEXT: s_mov_b32 s91, s23 +; SI-NEXT: s_mov_b32 s23, s17 
+; SI-NEXT: s_or_b32 s17, s46, s45 +; SI-NEXT: s_and_b32 s45, s55, 0xff ; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_or_b32 s74, s46, s45 -; SI-NEXT: s_mov_b32 s45, s83 -; SI-NEXT: s_mov_b32 s83, s91 -; SI-NEXT: s_mov_b32 s91, s28 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_mov_b32 s76, s59 +; SI-NEXT: s_mov_b32 s59, s57 +; SI-NEXT: s_mov_b32 s57, s34 +; SI-NEXT: s_mov_b32 s34, s82 +; SI-NEXT: s_mov_b32 s82, s52 +; SI-NEXT: s_mov_b32 s52, s95 +; SI-NEXT: s_mov_b32 s95, s27 +; SI-NEXT: s_mov_b32 s21, s22 +; SI-NEXT: s_mov_b32 s22, s69 +; SI-NEXT: s_mov_b32 s69, s48 +; SI-NEXT: s_mov_b32 s48, s37 +; SI-NEXT: s_mov_b32 s37, s56 +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s66, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_mov_b32 s56, s98 +; SI-NEXT: s_mov_b32 s98, s28 ; SI-NEXT: s_and_b32 s28, s42, 0xffff -; SI-NEXT: s_mov_b32 s59, s94 -; SI-NEXT: s_mov_b32 s94, s27 ; SI-NEXT: s_and_b32 s27, s43, 0xffff ; SI-NEXT: s_or_b32 s42, s12, s4 ; SI-NEXT: s_mov_b32 s43, s5 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: s_or_b32 s7, s7, s90 ; SI-NEXT: s_or_b32 s9, s9, vcc_lo -; SI-NEXT: v_writelane_b32 v41, s4, 5 -; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: v_writelane_b32 v41, s5, 6 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 16 ; SI-NEXT: s_or_b32 s11, s11, vcc_hi -; SI-NEXT: v_writelane_b32 v41, s4, 7 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_writelane_b32 v41, s5, 8 -; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 16 -; SI-NEXT: s_or_b32 s7, s7, s77 -; SI-NEXT: s_or_b32 s75, s44, s89 +; SI-NEXT: s_or_b32 s74, s46, s45 +; SI-NEXT: s_or_b32 s75, s44, s17 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_and_b32 s58, s15, 0xffff -; SI-NEXT: s_mov_b32 s44, s82 -; SI-NEXT: s_mov_b32 s82, s81 -; SI-NEXT: s_mov_b32 s81, s55 -; SI-NEXT: s_mov_b32 s55, s54 -; SI-NEXT: s_mov_b32 
s54, s51 -; SI-NEXT: s_mov_b32 s51, s37 -; SI-NEXT: s_mov_b32 s37, s16 +; SI-NEXT: s_mov_b32 s92, s16 ; SI-NEXT: s_and_b32 s16, s40, 0xffff -; SI-NEXT: s_mov_b32 s46, s98 -; SI-NEXT: s_mov_b32 s98, s93 -; SI-NEXT: s_and_b32 s93, s41, 0xffff -; SI-NEXT: v_writelane_b32 v41, s4, 9 -; SI-NEXT: s_mov_b32 s39, s49 +; SI-NEXT: s_mov_b32 s31, s29 +; SI-NEXT: s_and_b32 s29, s41, 0xffff +; SI-NEXT: v_writelane_b32 v41, s4, 7 +; SI-NEXT: s_mov_b32 s93, s39 +; SI-NEXT: s_mov_b32 s39, s79 +; SI-NEXT: v_writelane_b32 v41, s5, 8 ; SI-NEXT: s_or_b32 s40, s13, s6 ; SI-NEXT: s_mov_b32 s41, s7 -; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 16 ; SI-NEXT: s_or_b32 s14, s14, s8 ; SI-NEXT: s_mov_b32 s15, s9 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 16 ; SI-NEXT: s_or_b32 s12, s58, s10 ; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: v_writelane_b32 v41, s5, 10 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 ; SI-NEXT: s_or_b32 s10, s16, s60 ; SI-NEXT: s_mov_b32 s11, s61 ; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 -; SI-NEXT: s_or_b32 s8, s93, s62 +; SI-NEXT: s_or_b32 s8, s29, s62 ; SI-NEXT: s_mov_b32 s9, s63 ; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 ; SI-NEXT: s_or_b32 s6, s28, s72 @@ -81627,68 +81625,68 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s4, s27, s74 ; SI-NEXT: s_mov_b32 s5, s75 ; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 -; SI-NEXT: s_mov_b32 s16, s37 -; SI-NEXT: s_mov_b32 s37, s51 -; SI-NEXT: s_mov_b32 s51, s54 -; SI-NEXT: s_mov_b32 s54, s55 -; SI-NEXT: s_mov_b32 s55, s81 -; SI-NEXT: s_mov_b32 s81, s82 -; SI-NEXT: s_mov_b32 s82, s44 -; SI-NEXT: v_readlane_b32 s44, v41, 11 -; SI-NEXT: s_mov_b32 s93, s98 -; SI-NEXT: s_mov_b32 s98, s46 -; SI-NEXT: s_mov_b32 s46, s21 -; SI-NEXT: s_mov_b32 s21, s23 -; SI-NEXT: s_mov_b32 s28, s91 -; SI-NEXT: s_mov_b32 s91, s83 -; SI-NEXT: s_mov_b32 s83, s45 -; SI-NEXT: s_mov_b32 s27, s94 -; SI-NEXT: s_mov_b32 s94, s59 -; SI-NEXT: s_lshr_b32 s23, s57, 16 -; 
SI-NEXT: s_lshr_b32 s57, s77, 16 -; SI-NEXT: s_lshr_b32 s59, vcc_lo, 16 +; SI-NEXT: s_mov_b32 s16, s92 +; SI-NEXT: s_mov_b32 s79, s39 +; SI-NEXT: s_mov_b32 s39, s93 +; SI-NEXT: s_mov_b32 s29, s31 +; SI-NEXT: s_mov_b32 s28, s98 +; SI-NEXT: s_mov_b32 s98, s56 +; SI-NEXT: s_lshr_b32 s35, s30, 16 +; SI-NEXT: v_readlane_b32 s46, v41, 9 +; SI-NEXT: s_lshr_b32 s56, s90, 16 +; SI-NEXT: s_lshr_b32 s58, vcc_lo, 16 ; SI-NEXT: s_lshr_b32 s61, vcc_hi, 16 -; SI-NEXT: s_lshr_b32 s63, s24, 16 -; SI-NEXT: s_mov_b32 s24, s88 -; SI-NEXT: s_mov_b32 s88, s34 -; SI-NEXT: s_mov_b32 s34, s70 -; SI-NEXT: s_mov_b32 s70, s84 -; SI-NEXT: s_mov_b32 s84, s86 -; SI-NEXT: s_mov_b32 s86, s97 -; SI-NEXT: v_readlane_b32 s97, v41, 12 +; SI-NEXT: s_lshr_b32 s63, s20, 16 +; SI-NEXT: v_readlane_b32 s20, v41, 13 +; SI-NEXT: v_readlane_b32 s93, v41, 12 +; SI-NEXT: v_readlane_b32 s92, v41, 11 ; SI-NEXT: s_lshr_b32 s73, s18, 16 -; SI-NEXT: s_mov_b32 s18, s90 -; SI-NEXT: s_mov_b32 s90, s95 -; SI-NEXT: s_mov_b32 s49, s39 +; SI-NEXT: s_mov_b32 s18, s49 +; SI-NEXT: s_mov_b32 s49, s67 +; SI-NEXT: s_mov_b32 s67, s84 +; SI-NEXT: s_mov_b32 s84, s96 +; SI-NEXT: v_readlane_b32 s96, v41, 14 ; SI-NEXT: s_lshr_b32 s75, s19, 16 -; SI-NEXT: s_mov_b32 s19, s79 -; SI-NEXT: s_mov_b32 s79, s85 -; SI-NEXT: s_mov_b32 s85, s56 -; SI-NEXT: s_mov_b32 s56, s76 -; SI-NEXT: s_lshr_b32 s45, s89, 16 -; SI-NEXT: s_mov_b32 s89, s35 -; SI-NEXT: s_mov_b32 s35, s38 -; SI-NEXT: s_mov_b32 s38, s68 -; SI-NEXT: s_mov_b32 s68, s69 -; SI-NEXT: s_mov_b32 s69, s78 -; SI-NEXT: s_mov_b32 s78, s96 -; SI-NEXT: s_mov_b32 s96, s47 +; SI-NEXT: s_mov_b32 s19, s88 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_mov_b32 s17, s23 +; SI-NEXT: v_readlane_b32 s23, v41, 10 +; SI-NEXT: s_mov_b32 s30, s37 +; SI-NEXT: s_mov_b32 s37, s48 +; SI-NEXT: s_mov_b32 s48, s69 +; SI-NEXT: s_mov_b32 s69, s22 +; SI-NEXT: s_mov_b32 s22, s21 +; SI-NEXT: s_mov_b32 s27, s95 +; SI-NEXT: s_mov_b32 s95, s52 +; SI-NEXT: s_mov_b32 s52, s82 +; SI-NEXT: s_mov_b32 s82, s34 +; SI-NEXT: 
s_mov_b32 s88, s91 +; SI-NEXT: s_mov_b32 s91, s38 +; SI-NEXT: s_mov_b32 s38, s70 +; SI-NEXT: s_mov_b32 s70, s83 +; SI-NEXT: s_mov_b32 s83, s24 +; SI-NEXT: s_mov_b32 s24, s47 +; SI-NEXT: s_mov_b32 s47, s77 +; SI-NEXT: s_mov_b32 s34, s57 +; SI-NEXT: s_mov_b32 s57, s59 +; SI-NEXT: s_mov_b32 s59, s76 ; SI-NEXT: s_mov_b64 s[76:77], 0 ; SI-NEXT: s_branch .LBB107_3 ; SI-NEXT: .LBB107_2: ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_mov_b64 s[76:77], -1 -; SI-NEXT: v_writelane_b32 v41, s4, 5 -; SI-NEXT: v_writelane_b32 v41, s5, 6 -; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 7 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: v_writelane_b32 v41, s5, 8 +; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr60 @@ -81699,39 +81697,31 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: v_writelane_b32 v41, s4, 7 -; SI-NEXT: v_writelane_b32 v41, s5, 8 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 9 -; SI-NEXT: v_writelane_b32 v41, s5, 10 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: .LBB107_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[76:77] -; SI-NEXT: v_readlane_b32 s76, v41, 5 -; SI-NEXT: v_readlane_b32 s77, v41, 6 -; SI-NEXT: s_mov_b32 s58, s76 ; SI-NEXT: v_readlane_b32 
s76, v41, 7 ; SI-NEXT: v_readlane_b32 s77, v41, 8 ; SI-NEXT: s_cbranch_vccnz .LBB107_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s21, s47, 3 ; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s79, 8 -; SI-NEXT: s_add_i32 s50, s50, 3 +; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_add_i32 s50, s55, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s50, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s55, 24 +; SI-NEXT: s_lshl_b32 s5, s66, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s39, s52, 3 +; SI-NEXT: s_add_i32 s39, s39, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s39, 0xff -; SI-NEXT: s_lshl_b32 s6, s93, 8 -; SI-NEXT: s_add_i32 s79, s16, 3 +; SI-NEXT: s_lshl_b32 s6, s79, 8 +; SI-NEXT: s_add_i32 s79, s89, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s7, s79, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 @@ -81739,21 +81729,20 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s6, v41, 4 -; SI-NEXT: s_add_i32 s23, s6, 3 ; SI-NEXT: s_and_b32 s6, s23, 0xff -; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_lshl_b32 s7, s24, 8 ; SI-NEXT: s_add_i32 s99, s99, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s8, s99, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s46, 24 +; SI-NEXT: s_lshl_b32 s7, s26, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_add_i32 s98, s98, 3 -; SI-NEXT: v_readlane_b32 s8, v41, 3 +; SI-NEXT: v_readlane_b32 s8, v41, 6 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s98, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 8 @@ -81761,32 +81750,31 @@ define inreg <32 x half> 
@bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s9, s85, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_lshl_b32 s8, s56, 24 +; SI-NEXT: s_lshl_b32 s8, s59, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_add_i32 s96, s96, 3 +; SI-NEXT: s_add_i32 s96, s46, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s96, 0xff -; SI-NEXT: s_lshl_b32 s9, s44, 8 -; SI-NEXT: s_add_i32 s83, s83, 3 +; SI-NEXT: s_lshl_b32 s9, s83, 8 +; SI-NEXT: s_add_i32 s83, s34, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v41, 2 ; SI-NEXT: s_and_b32 s10, s83, 0xff ; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s9, s57, 24 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_add_i32 s82, s82, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_and_b32 s9, s82, 0xff -; SI-NEXT: s_lshl_b32 s10, s30, 8 +; SI-NEXT: s_lshl_b32 s10, s25, 8 ; SI-NEXT: s_add_i32 s69, s69, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s11, s69, 0xff ; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: s_lshl_b32 s10, s78, 24 +; SI-NEXT: s_lshl_b32 s10, s22, 24 ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s10, s10, s11 @@ -81794,7 +81782,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s80, 0xff ; SI-NEXT: s_lshl_b32 s11, s87, 8 -; SI-NEXT: s_add_i32 s66, s66, 3 +; SI-NEXT: s_add_i32 s66, s84, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_and_b32 s12, s66, 0xff ; SI-NEXT: s_addk_i32 s10, 0x300 @@ -81805,20 +81793,20 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s65, s65, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_and_b32 s11, s65, 0xff -; SI-NEXT: s_lshl_b32 s12, 
s84, 8 -; SI-NEXT: s_add_i32 s52, s68, 3 +; SI-NEXT: s_lshl_b32 s12, s81, 8 +; SI-NEXT: s_add_i32 s52, s52, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_and_b32 s13, s52, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 -; SI-NEXT: s_lshl_b32 s12, s70, 24 +; SI-NEXT: s_lshl_b32 s12, s68, 24 ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: s_add_i32 s55, s81, 3 +; SI-NEXT: s_add_i32 s55, s71, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_and_b32 s12, s55, 0xff -; SI-NEXT: s_lshl_b32 s13, s71, 8 -; SI-NEXT: s_add_i32 s48, s34, 3 +; SI-NEXT: s_lshl_b32 s13, s70, 8 +; SI-NEXT: s_add_i32 s48, s48, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_and_b32 s14, s48, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 @@ -81830,7 +81818,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_and_b32 s13, s38, 0xff ; SI-NEXT: s_lshl_b32 s14, s64, 8 -; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: s_add_i32 s31, s97, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_and_b32 s15, s31, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 @@ -81838,84 +81826,85 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: s_add_i32 s36, s54, 3 +; SI-NEXT: s_add_i32 s36, s92, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_and_b32 s14, s36, 0xff ; SI-NEXT: s_lshl_b32 s15, s53, 8 -; SI-NEXT: s_add_i32 s95, s88, 3 +; SI-NEXT: s_add_i32 s95, s95, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s21, s95, 0xff ; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_lshl_b32 s15, s51, 24 +; SI-NEXT: s_lshl_b32 s15, s49, 24 ; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s15, s15, s21 ; SI-NEXT: s_add_i32 s94, s94, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s94, 0xff -; 
SI-NEXT: s_lshl_b32 s21, s49, 8 -; SI-NEXT: s_add_i32 s91, s35, 3 +; SI-NEXT: s_lshl_b32 s21, s93, 8 +; SI-NEXT: s_add_i32 s91, s28, 3 ; SI-NEXT: s_or_b32 s15, s21, s15 -; SI-NEXT: s_and_b32 s16, s91, 0xff +; SI-NEXT: s_and_b32 s22, s91, 0xff ; SI-NEXT: s_addk_i32 s15, 0x300 ; SI-NEXT: s_lshl_b32 s21, s37, 24 -; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_or_b32 s21, s21, s16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_add_i32 s24, s29, 3 +; SI-NEXT: v_readlane_b32 s23, v41, 1 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s15, s21, s15 ; SI-NEXT: s_and_b32 s21, s24, 0xff -; SI-NEXT: s_lshl_b32 s16, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s21, s16, s21 -; SI-NEXT: s_and_b32 s23, s26, 0xff -; SI-NEXT: s_addk_i32 s21, 0x300 -; SI-NEXT: s_lshl_b32 s16, s27, 24 -; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s23 -; SI-NEXT: s_or_b32 s21, s16, s21 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s40, s21, 0x3000000 -; SI-NEXT: s_and_b32 s21, s28, 0xff -; SI-NEXT: s_lshl_b32 s16, s29, 8 -; SI-NEXT: s_lshl_b32 s23, s89, 24 -; SI-NEXT: s_add_i32 s89, s90, 3 -; SI-NEXT: s_or_b32 s21, s16, s21 -; SI-NEXT: s_and_b32 s16, s89, 0xff -; SI-NEXT: s_addk_i32 s21, 0x300 -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s23, s16 -; SI-NEXT: s_or_b32 s16, s16, s21 -; SI-NEXT: s_add_i32 s41, s16, 0x3000000 -; SI-NEXT: s_add_i32 s16, s92, 3 +; SI-NEXT: s_lshl_b32 s22, s30, 8 +; SI-NEXT: s_add_i32 s26, s23, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_and_b32 s23, s26, 0xff ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s22, s27, 24 +; SI-NEXT: 
s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s19, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s21, s22, s21 ; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s40, s21, 0x3000000 +; SI-NEXT: v_readlane_b32 s21, v41, 0 ; SI-NEXT: s_add_i32 s42, s16, 0x3000000 -; SI-NEXT: s_add_i32 s16, s20, 3 -; SI-NEXT: v_readlane_b32 s17, v41, 1 +; SI-NEXT: v_readlane_b32 s16, v41, 5 +; SI-NEXT: s_add_i32 s28, s21, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 4 +; SI-NEXT: v_readlane_b32 s18, v41, 3 +; SI-NEXT: s_and_b32 s21, s28, 0xff +; SI-NEXT: s_lshl_b32 s22, s88, 8 +; SI-NEXT: s_add_i32 s89, s54, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s22, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_and_b32 s22, s89, 0xff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v41, 0 +; SI-NEXT: v_readlane_b32 s17, v41, 2 ; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s20, s20, 24 +; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 @@ -81926,49 +81915,49 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s11, s11, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 0x3000000 -; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: s_or_b32 s20, s20, s21 
+; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s14, s14, 0x3000000 ; SI-NEXT: s_add_i32 s15, s15, 0x3000000 -; SI-NEXT: s_lshr_b64 s[58:59], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_add_i32 s41, s20, 0x3000000 +; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[48:49], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 9 -; SI-NEXT: s_lshr_b32 s23, s43, 16 -; SI-NEXT: s_lshr_b32 s57, s41, 16 -; SI-NEXT: s_lshr_b32 s59, s15, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s35, s43, 16 +; SI-NEXT: s_lshr_b32 s56, s41, 16 +; SI-NEXT: s_lshr_b32 s58, s15, 16 ; SI-NEXT: s_lshr_b32 s61, s13, 16 ; SI-NEXT: s_lshr_b32 s63, s11, 16 ; SI-NEXT: s_lshr_b32 s73, s9, 16 ; SI-NEXT: s_lshr_b32 s75, s7, 16 ; SI-NEXT: s_lshr_b32 s45, s5, 16 -; SI-NEXT: v_writelane_b32 v41, s17, 10 ; SI-NEXT: .LBB107_5: ; %end ; SI-NEXT: s_and_b32 s16, s42, 0xffff -; SI-NEXT: s_lshl_b32 s17, s58, 16 +; SI-NEXT: s_lshl_b32 s17, s76, 16 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s43, 0xffff -; SI-NEXT: s_lshl_b32 s18, s23, 16 +; SI-NEXT: s_lshl_b32 s18, s35, 16 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s40, 0xffff -; SI-NEXT: s_lshl_b32 s19, s48, 16 +; SI-NEXT: s_lshl_b32 s19, s78, 16 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s19, s41, 0xffff -; SI-NEXT: s_lshl_b32 s20, s57, 16 +; SI-NEXT: s_lshl_b32 s20, s56, 16 ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s20, s76, 16 +; SI-NEXT: s_lshl_b32 s20, s50, 16 ; SI-NEXT: s_or_b32 s14, s14, s20 ; SI-NEXT: s_and_b32 s15, s15, 0xffff 
-; SI-NEXT: s_lshl_b32 s20, s59, 16 +; SI-NEXT: s_lshl_b32 s20, s58, 16 ; SI-NEXT: s_or_b32 s15, s15, s20 -; SI-NEXT: v_readlane_b32 s20, v41, 9 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_lshl_b32 s20, s44, 16 ; SI-NEXT: s_or_b32 s12, s12, s20 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_lshl_b32 s20, s61, 16 @@ -81997,7 +81986,6 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_lshl_b32 s20, s45, 16 ; SI-NEXT: s_or_b32 s5, s5, s20 -; SI-NEXT: v_readlane_b32 s21, v41, 10 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll index c167834470e3..83e62cbb9b6f 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -27,7 +27,7 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocaptu ; CHECK-NEXT: s_addc_u32 s1, s3, s5 ; CHECK-NEXT: s_bfe_u32 s2, s6, 0xd0003 ; CHECK-NEXT: s_add_i32 s2, s2, s7 -; CHECK-NEXT: s_or_b32 s2, s2, 0xc0 +; CHECK-NEXT: s_addk_i32 s2, 0xc0 ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index eff0680fe9a3..5283233a0b46 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -3798,36 +3798,36 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; VI-NEXT: s_min_u32 s3, s3, s7 ; VI-NEXT: s_min_u32 s10, s11, s10 ; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_or_b32 s3, s10, s3 -; VI-NEXT: s_and_b32 s7, s6, 0xffff -; VI-NEXT: s_and_b32 s10, s2, 0xffff +; VI-NEXT: s_or_b32 s10, s10, s3 +; VI-NEXT: s_and_b32 s3, s6, 0xffff +; VI-NEXT: s_and_b32 s7, s2, 0xffff ; VI-NEXT: s_lshr_b32 
s6, s6, 16 ; VI-NEXT: s_lshr_b32 s2, s2, 16 ; VI-NEXT: s_min_u32 s2, s2, s6 -; VI-NEXT: s_min_u32 s7, s10, s7 +; VI-NEXT: s_min_u32 s3, s7, s3 ; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s2, s7, s2 -; VI-NEXT: s_and_b32 s6, s5, 0xffff -; VI-NEXT: s_and_b32 s7, s1, 0xffff +; VI-NEXT: s_or_b32 s3, s3, s2 +; VI-NEXT: s_and_b32 s2, s5, 0xffff +; VI-NEXT: s_and_b32 s6, s1, 0xffff ; VI-NEXT: s_lshr_b32 s5, s5, 16 ; VI-NEXT: s_lshr_b32 s1, s1, 16 ; VI-NEXT: s_min_u32 s1, s1, s5 -; VI-NEXT: s_min_u32 s6, s7, s6 +; VI-NEXT: s_min_u32 s2, s6, s2 ; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_or_b32 s1, s6, s1 -; VI-NEXT: s_and_b32 s5, s4, 0xffff -; VI-NEXT: s_and_b32 s6, s0, 0xffff +; VI-NEXT: s_or_b32 s2, s2, s1 +; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: s_and_b32 s5, s0, 0xffff ; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: s_lshr_b32 s0, s0, 16 ; VI-NEXT: s_min_u32 s0, s0, s4 -; VI-NEXT: s_min_u32 s5, s6, s5 +; VI-NEXT: s_min_u32 s1, s5, s1 ; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_or_b32 s0, s5, s0 +; VI-NEXT: s_or_b32 s1, s1, s0 ; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s10 ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/s_or_b32_transformation.ll b/llvm/test/CodeGen/AMDGPU/s_or_b32_transformation.ll new file mode 100644 index 000000000000..c0ad0c0c64ad --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s_or_b32_transformation.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s +; This tests if disjoint s_or_b32 gets transformed to s_addk_i32 when we can't use s_bitset1_b32 + +define amdgpu_ps i32 
@s_or_b32_i32(i32 inreg %x) { +; CHECK-LABEL: s_or_b32_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_or_b32 s0, s0, 0x101 +; CHECK-NEXT: ; return to shader part epilog + %or = or i32 %x, 257 + ret i32 %or +} + +define amdgpu_ps i32 @s_or_b32_disjoint_to_s_addk_i32(i32 inreg %x) { +; CHECK-LABEL: s_or_b32_disjoint_to_s_addk_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_addk_i32 s0, 0x101 +; CHECK-NEXT: ; return to shader part epilog + %or = or disjoint i32 %x, 257 + ret i32 %or +} + +define amdgpu_ps i32 @s_or_b32_to_s_bitset1_b32(i32 inreg %x) { +; CHECK-LABEL: s_or_b32_to_s_bitset1_b32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bitset1_b32 s0, 8 +; CHECK-NEXT: ; return to shader part epilog + %or = or disjoint i32 %x, 256 + ret i32 %or +} +