diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 5b32bd0b72a5..14ed778f44f3 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -909,9 +909,21 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { } } + // Shrink scalar logic operations. + if (MI.getOpcode() == AMDGPU::S_AND_B32 || + MI.getOpcode() == AMDGPU::S_OR_B32 || + MI.getOpcode() == AMDGPU::S_XOR_B32) { + ChangeKind CK = shrinkScalarLogicOp(MI); + if (CK == ChangeKind::UpdateHint) + continue; + Changed |= (CK == ChangeKind::UpdateInst); + } + // Try to use S_ADDK_I32 and S_MULK_I32. if (MI.getOpcode() == AMDGPU::S_ADD_I32 || - MI.getOpcode() == AMDGPU::S_MUL_I32) { + MI.getOpcode() == AMDGPU::S_MUL_I32 || + (MI.getOpcode() == AMDGPU::S_OR_B32 && + MI.getFlag(MachineInstr::MIFlag::Disjoint))) { const MachineOperand *Dest = &MI.getOperand(0); MachineOperand *Src0 = &MI.getOperand(1); MachineOperand *Src1 = &MI.getOperand(2); @@ -931,12 +943,11 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); continue; } - if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { if (Src1->isImm() && isKImmOperand(*Src1)) { - unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? - AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; - + unsigned Opc = (MI.getOpcode() == AMDGPU::S_MUL_I32) + ? AMDGPU::S_MULK_I32 + : AMDGPU::S_ADDK_I32; Src1->setImm(SignExtend64(Src1->getImm(), 32)); MI.setDesc(TII->get(Opc)); MI.tieOperands(0, 1); @@ -974,16 +985,6 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { continue; } - // Shrink scalar logic operations. - if (MI.getOpcode() == AMDGPU::S_AND_B32 || - MI.getOpcode() == AMDGPU::S_OR_B32 || - MI.getOpcode() == AMDGPU::S_XOR_B32) { - ChangeKind CK = shrinkScalarLogicOp(MI); - if (CK == ChangeKind::UpdateHint) - continue; - Changed |= (CK == ChangeKind::UpdateInst); - } - if (IsPostRA && TII->isMIMG(MI.getOpcode()) && ST->getGeneration() >= AMDGPUSubtarget::GFX10) { Changed |= shrinkMIMG(MI); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 6ad73601859d..4fefef5e6615 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -664,8 +664,8 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX6-NEXT: s_lshr_b32 s1, s1, 1 ; GFX6-NEXT: s_lshl_b32 s2, s3, s2 ; GFX6-NEXT: s_lshr_b32 s1, s1, s4 -; GFX6-NEXT: s_or_b32 s1, s2, s1 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_or_b32 s2, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xff ; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -954,17 +954,17 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX6-NEXT: s_and_b32 s4, s8, 7 ; GFX6-NEXT: s_andn2_b32 s6, 7, s8 ; GFX6-NEXT: s_lshr_b32 s1, s1, 25 -; GFX6-NEXT: s_and_b32 s2, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s4, s5, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, s6 +; GFX6-NEXT: s_or_b32 s4, s4, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xff ; GFX6-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_or_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s3, 0xff -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s4, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -3689,8 +3689,8 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; GFX8-NEXT: s_lshr_b32 s1, s1, 17 ; GFX8-NEXT: s_lshl_b32 s2, s3, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, s4 -; GFX8-NEXT: s_or_b32 s1, s2, s1 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_or_b32 s2, s2, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -4268,15 +4268,15 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, < ; GFX8-NEXT: s_lshr_b32 s2, s2, 17 ; GFX8-NEXT: s_lshl_b32 s4, s6, s4 ; GFX8-NEXT: s_lshr_b32 s2, s2, s7 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s4, s5, 15 +; GFX8-NEXT: s_or_b32 s4, s4, s2 +; GFX8-NEXT: s_and_b32 s2, s5, 15 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_lshr_b32 s3, s3, s5 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_or_b32 s1, s1, s3 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_lshr_b32 s2, s2, s5 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 @@ -4614,29 +4614,29 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; GFX8-NEXT: s_lshr_b32 s2, s2, 17 ; GFX8-NEXT: s_lshl_b32 s4, s6, s4 ; GFX8-NEXT: s_lshr_b32 s2, s2, s8 -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s4, s5, 15 +; GFX8-NEXT: s_or_b32 s4, s4, s2 +; GFX8-NEXT: s_and_b32 s2, s5, 15 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s4, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s9, s5, 16 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_lshr_b32 s4, s4, 1 -; GFX8-NEXT: s_lshr_b32 s4, s4, s5 -; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s4, s9, 15 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_lshr_b32 s2, s2, s5 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s9, 15 ; GFX8-NEXT: s_andn2_b32 s5, 15, s9 ; GFX8-NEXT: s_lshr_b32 s3, s3, 17 -; GFX8-NEXT: s_lshl_b32 s4, s7, s4 +; GFX8-NEXT: s_lshl_b32 s2, s7, s2 ; GFX8-NEXT: s_lshr_b32 s3, s3, s5 +; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 5afab53628c3..bc6a2e7c4325 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -665,8 +665,8 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, s2 -; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_or_b32 s3, s3, s1 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -946,28 +946,28 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX6-NEXT: s_bfe_u32 s7, s1, 0x80008 ; GFX6-NEXT: s_lshr_b32 s2, s7, s2 ; GFX6-NEXT: s_lshr_b32 s6, s1, 24 -; GFX6-NEXT: s_or_b32 s2, s3, s2 -; GFX6-NEXT: s_and_b32 s3, s8, 7 +; GFX6-NEXT: s_or_b32 s3, s3, s2 +; GFX6-NEXT: s_and_b32 s2, s8, 7 ; GFX6-NEXT: s_andn2_b32 s7, 7, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX6-NEXT: s_lshl_b32 s4, s4, s7 -; GFX6-NEXT: s_lshr_b32 s1, s1, s3 -; GFX6-NEXT: s_or_b32 s1, s4, s1 -; GFX6-NEXT: s_and_b32 s3, s9, 7 -; GFX6-NEXT: s_andn2_b32 s4, 7, s9 +; GFX6-NEXT: s_lshr_b32 s1, s1, s2 +; GFX6-NEXT: s_or_b32 s4, s4, s1 +; GFX6-NEXT: s_and_b32 s1, s9, 7 +; GFX6-NEXT: s_andn2_b32 s2, 7, s9 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 -; GFX6-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NEXT: s_lshl_b32 s4, s5, s4 -; GFX6-NEXT: s_lshr_b32 s3, s6, s3 +; GFX6-NEXT: s_lshl_b32 s2, s5, s2 +; GFX6-NEXT: s_lshr_b32 s1, s6, s1 +; GFX6-NEXT: s_or_b32 s2, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: s_or_b32 s3, s4, s3 -; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s4, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, 0xff +; GFX6-NEXT: s_and_b32 s1, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -3443,8 +3443,8 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshl_b32 s2, s3, s2 ; GFX8-NEXT: s_lshr_b32 s1, s4, s1 -; GFX8-NEXT: s_or_b32 s1, s2, s1 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_or_b32 s2, s2, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -4026,15 +4026,15 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, < ; GFX8-NEXT: s_lshl_b32 s6, s6, 1 ; GFX8-NEXT: s_lshl_b32 s4, s6, s4 ; GFX8-NEXT: s_lshr_b32 s2, s7, s2 -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s4, s5, 15 +; GFX8-NEXT: s_or_b32 s4, s4, s2 +; GFX8-NEXT: s_and_b32 s2, s5, 15 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, s4 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_or_b32 s1, s1, s3 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 @@ -4376,8 +4376,8 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; GFX8-NEXT: s_lshl_b32 s6, s6, 1 ; GFX8-NEXT: s_lshl_b32 s4, s6, s4 ; GFX8-NEXT: s_lshr_b32 s2, s7, s2 -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_or_b32 s4, s4, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 6c09950689ef..5a06737d923f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -68693,7 +68693,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v40, s53, 13 ; SI-NEXT: v_writelane_b32 v40, s54, 14 ; SI-NEXT: v_writelane_b32 v40, s55, 15 -; SI-NEXT: s_mov_b32 s92, s16 ; SI-NEXT: v_writelane_b32 v40, s64, 16 ; SI-NEXT: v_writelane_b32 v40, s65, 17 ; SI-NEXT: v_writelane_b32 v40, s66, 18 @@ -68710,52 +68709,54 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: v_writelane_b32 v40, s86, 30 ; SI-NEXT: v_writelane_b32 v40, s87, 31 -; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v40, s96, 32 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v41, s23, 0 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v40, s97, 33 -; SI-NEXT: v_writelane_b32 v41, s21, 1 -; SI-NEXT: v_readfirstlane_b32 s47, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s28, 0 +; SI-NEXT: v_writelane_b32 v41, s26, 1 +; SI-NEXT: v_writelane_b32 v41, s23, 2 +; SI-NEXT: v_writelane_b32 v41, s22, 3 ; SI-NEXT: v_writelane_b32 v40, s98, 34 -; SI-NEXT: v_writelane_b32 v41, s47, 2 +; SI-NEXT: v_writelane_b32 v41, s21, 4 ; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: s_mov_b32 s88, s29 +; SI-NEXT: s_mov_b32 s30, s25 +; SI-NEXT: s_mov_b32 s29, s24 +; SI-NEXT: v_writelane_b32 v41, s20, 5 ; SI-NEXT: v_readfirstlane_b32 s82, v30 -; SI-NEXT: v_readfirstlane_b32 s83, v28 -; SI-NEXT: v_readfirstlane_b32 s44, v27 -; SI-NEXT: v_readfirstlane_b32 s96, v26 -; SI-NEXT: v_readfirstlane_b32 s70, v25 -; SI-NEXT: v_readfirstlane_b32 s68, v24 -; SI-NEXT: v_readfirstlane_b32 s84, v23 +; SI-NEXT: v_readfirstlane_b32 s57, v29 +; SI-NEXT: v_readfirstlane_b32 s34, v28 +; SI-NEXT: v_readfirstlane_b32 s83, v27 +; SI-NEXT: v_readfirstlane_b32 s46, v26 +; SI-NEXT: v_readfirstlane_b32 s68, v25 +; SI-NEXT: v_readfirstlane_b32 s52, v24 +; SI-NEXT: v_readfirstlane_b32 s81, v23 ; SI-NEXT: v_readfirstlane_b32 s65, v22 ; SI-NEXT: v_readfirstlane_b32 s86, v21 -; SI-NEXT: v_readfirstlane_b32 s66, v20 +; SI-NEXT: v_readfirstlane_b32 s84, v20 ; SI-NEXT: v_readfirstlane_b32 s87, v19 ; SI-NEXT: v_readfirstlane_b32 s80, v18 ; SI-NEXT: v_readfirstlane_b32 s36, v17 -; SI-NEXT: v_readfirstlane_b32 s31, v16 +; SI-NEXT: v_readfirstlane_b32 s97, v16 ; SI-NEXT: v_readfirstlane_b32 s64, v15 ; SI-NEXT: v_readfirstlane_b32 s38, v14 ; SI-NEXT: v_readfirstlane_b32 s67, v13 -; SI-NEXT: v_readfirstlane_b32 s34, v12 -; SI-NEXT: v_readfirstlane_b32 s71, v11 -; SI-NEXT: v_readfirstlane_b32 s81, v10 +; SI-NEXT: v_readfirstlane_b32 s48, v12 +; SI-NEXT: v_readfirstlane_b32 s70, v11 +; SI-NEXT: v_readfirstlane_b32 s71, v10 ; SI-NEXT: v_readfirstlane_b32 s37, v9 -; SI-NEXT: v_readfirstlane_b32 s35, v8 -; SI-NEXT: v_readfirstlane_b32 s49, v7 +; SI-NEXT: v_readfirstlane_b32 s28, v8 +; SI-NEXT: v_readfirstlane_b32 s93, v7 ; SI-NEXT: v_readfirstlane_b32 s94, v6 -; SI-NEXT: v_readfirstlane_b32 s51, v5 -; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: v_readfirstlane_b32 s53, v3 -; SI-NEXT: v_readfirstlane_b32 s54, v2 -; SI-NEXT: v_readfirstlane_b32 s89, v1 -; SI-NEXT: v_readfirstlane_b32 s90, v0 +; SI-NEXT: v_readfirstlane_b32 s49, v5 +; SI-NEXT: v_readfirstlane_b32 s95, v4 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s91, v31 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s16, v32 +; SI-NEXT: v_readfirstlane_b32 s89, v32 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s93, v33 +; SI-NEXT: v_readfirstlane_b32 s79, v33 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 @@ -68765,245 +68766,248 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s52, v34 +; SI-NEXT: v_readfirstlane_b32 s39, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s55, v35 +; SI-NEXT: v_readfirstlane_b32 s66, v35 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s79, v37 +; SI-NEXT: v_readfirstlane_b32 s96, v37 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 -; SI-NEXT: v_readfirstlane_b32 s50, v36 +; SI-NEXT: v_readfirstlane_b32 s55, v36 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s21, v38 +; SI-NEXT: v_readfirstlane_b32 s47, v38 +; SI-NEXT: v_readfirstlane_b32 s53, v3 +; SI-NEXT: v_readfirstlane_b32 s92, v2 +; SI-NEXT: v_readfirstlane_b32 s20, v1 +; SI-NEXT: v_readfirstlane_b32 s54, v0 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s56, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s85, v32 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s58, v33 +; SI-NEXT: v_readfirstlane_b32 s21, v33 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s98, v39 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: v_readfirstlane_b32 s26, v48 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s99, v49 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s97, v50 +; SI-NEXT: v_readfirstlane_b32 s24, v50 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s9, v51 -; SI-NEXT: v_writelane_b32 v41, s58, 3 -; SI-NEXT: v_writelane_b32 v41, s9, 4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; SI-NEXT: v_readfirstlane_b32 s23, v51 +; SI-NEXT: v_writelane_b32 v41, s21, 6 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: v_readfirstlane_b32 s22, v34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s69, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v36 +; SI-NEXT: v_readfirstlane_b32 s25, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB99_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s19, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s24, 0xff -; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_and_b32 s5, s29, 0xff +; SI-NEXT: s_lshl_b32 s6, s30, 8 ; SI-NEXT: s_or_b32 s13, s5, s6 -; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_readlane_b32 s5, v41, 1 +; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s27, 24 ; SI-NEXT: s_or_b32 s6, s6, s5 -; SI-NEXT: s_and_b32 s5, s54, 0xff +; SI-NEXT: s_and_b32 s5, s92, 0xff ; SI-NEXT: s_lshl_b32 s7, s53, 8 ; SI-NEXT: s_or_b32 s14, s5, s7 -; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: s_and_b32 s5, s95, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s51, 24 +; SI-NEXT: s_lshl_b32 s7, s49, 24 ; SI-NEXT: s_or_b32 s8, s7, s5 -; SI-NEXT: s_and_b32 s5, s81, 0xff -; SI-NEXT: s_lshl_b32 s7, s71, 8 +; SI-NEXT: s_and_b32 s5, s71, 0xff +; SI-NEXT: s_lshl_b32 s7, s70, 8 ; SI-NEXT: s_or_b32 s15, s5, s7 -; SI-NEXT: s_and_b32 s5, s34, 0xff +; SI-NEXT: s_and_b32 s5, s48, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s7, s67, 24 ; SI-NEXT: s_or_b32 s10, s7, s5 ; SI-NEXT: s_and_b32 s5, s80, 0xff ; SI-NEXT: s_lshl_b32 s7, s87, 8 ; SI-NEXT: s_or_b32 s40, s5, s7 -; SI-NEXT: s_and_b32 s5, s66, 0xff +; SI-NEXT: s_and_b32 s5, s84, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s7, s86, 24 ; SI-NEXT: s_or_b32 s60, s7, s5 -; SI-NEXT: s_and_b32 s5, s96, 0xff -; SI-NEXT: s_lshl_b32 s7, s44, 8 +; SI-NEXT: s_and_b32 s5, s46, 0xff +; SI-NEXT: s_lshl_b32 s7, s83, 8 ; SI-NEXT: s_or_b32 s41, s5, s7 -; SI-NEXT: s_and_b32 s5, s9, 0xff -; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_and_b32 s5, s23, 0xff +; SI-NEXT: s_lshl_b32 s7, s24, 8 ; SI-NEXT: s_or_b32 s42, s5, s7 -; SI-NEXT: s_and_b32 s5, s21, 0xff -; SI-NEXT: s_lshl_b32 s7, s79, 8 +; SI-NEXT: s_and_b32 s5, s47, 0xff +; SI-NEXT: s_lshl_b32 s7, s96, 8 ; SI-NEXT: s_or_b32 s43, s5, s7 -; SI-NEXT: v_readlane_b32 s7, v41, 1 -; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: v_readlane_b32 s5, v41, 5 +; SI-NEXT: v_readlane_b32 s7, v41, 4 +; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s5, s5, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: v_readlane_b32 s9, v41, 0 +; SI-NEXT: v_readlane_b32 s7, v41, 3 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: v_readlane_b32 s9, v41, 2 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s9, s9, 24 -; SI-NEXT: s_or_b32 s57, s9, s7 -; SI-NEXT: s_and_b32 s7, s28, 0xff -; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: v_writelane_b32 v41, s46, 9 +; SI-NEXT: s_mov_b32 s56, s30 +; SI-NEXT: s_or_b32 s30, s9, s7 +; SI-NEXT: v_readlane_b32 s7, v41, 0 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_lshl_b32 s9, s88, 8 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s90, 0xff +; SI-NEXT: s_and_b32 s9, s54, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s89, 24 -; SI-NEXT: s_or_b32 s77, s11, s9 +; SI-NEXT: s_lshl_b32 s11, s20, 24 +; SI-NEXT: s_or_b32 s90, s11, s9 ; SI-NEXT: s_and_b32 s9, s94, 0xff -; SI-NEXT: s_lshl_b32 s11, s49, 8 +; SI-NEXT: s_lshl_b32 s11, s93, 8 ; SI-NEXT: s_or_b32 s9, s9, s11 -; SI-NEXT: s_and_b32 s11, s35, 0xff +; SI-NEXT: s_and_b32 s11, s28, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: v_writelane_b32 v41, s44, 11 ; SI-NEXT: s_lshl_b32 s44, s37, 24 ; SI-NEXT: s_or_b32 vcc_lo, s44, s11 ; SI-NEXT: s_and_b32 s11, s38, 0xff ; SI-NEXT: s_lshl_b32 s44, s64, 8 ; SI-NEXT: s_or_b32 s11, s11, s44 -; SI-NEXT: s_and_b32 s44, s31, 0xff +; SI-NEXT: s_and_b32 s44, s97, 0xff ; SI-NEXT: s_lshl_b32 s44, s44, 16 ; SI-NEXT: s_lshl_b32 s45, s36, 24 +; SI-NEXT: v_writelane_b32 v41, s23, 10 ; SI-NEXT: s_or_b32 vcc_hi, s45, s44 ; SI-NEXT: s_and_b32 s44, s65, 0xff -; SI-NEXT: s_lshl_b32 s45, s84, 8 +; SI-NEXT: s_lshl_b32 s45, s81, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: s_and_b32 s45, s68, 0xff +; SI-NEXT: s_and_b32 s45, s52, 0xff +; SI-NEXT: v_writelane_b32 v41, s92, 11 ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_mov_b32 s23, s21 -; SI-NEXT: s_mov_b32 s21, s46 -; SI-NEXT: s_lshl_b32 s46, s70, 24 +; SI-NEXT: s_lshl_b32 s46, s68, 24 +; SI-NEXT: v_writelane_b32 v41, s93, 12 ; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_writelane_b32 v41, s97, 12 -; SI-NEXT: s_mov_b32 s97, s86 -; SI-NEXT: s_mov_b32 s86, s84 -; SI-NEXT: s_mov_b32 s84, s70 -; SI-NEXT: s_mov_b32 s70, s34 -; SI-NEXT: s_mov_b32 s34, s88 -; SI-NEXT: s_mov_b32 s88, s24 -; SI-NEXT: s_or_b32 s24, s46, s45 -; SI-NEXT: s_or_b32 s61, s44, s24 +; SI-NEXT: v_writelane_b32 v41, s20, 13 +; SI-NEXT: s_or_b32 s20, s46, s45 +; SI-NEXT: s_or_b32 s61, s44, s20 ; SI-NEXT: s_and_b32 s44, s82, 0xff -; SI-NEXT: s_lshl_b32 s45, s30, 8 +; SI-NEXT: s_lshl_b32 s45, s25, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_and_b32 s45, s69, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s78, 24 -; SI-NEXT: s_mov_b32 s95, s90 -; SI-NEXT: s_mov_b32 s90, s18 +; SI-NEXT: s_lshl_b32 s46, s22, 24 +; SI-NEXT: v_writelane_b32 v41, s96, 14 +; SI-NEXT: s_mov_b32 s96, s84 +; SI-NEXT: s_mov_b32 s84, s67 +; SI-NEXT: s_mov_b32 s67, s49 +; SI-NEXT: s_mov_b32 s49, s18 ; SI-NEXT: s_or_b32 s18, s46, s45 -; SI-NEXT: s_and_b32 s45, s83, 0xff +; SI-NEXT: s_and_b32 s45, s34, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s47, 24 +; SI-NEXT: s_lshl_b32 s46, s57, 24 ; SI-NEXT: s_and_b32 s44, s44, 0xffff ; SI-NEXT: s_or_b32 s62, s46, s45 ; SI-NEXT: s_or_b32 s63, s44, s18 ; SI-NEXT: s_and_b32 s44, s98, 0xff -; SI-NEXT: s_lshl_b32 s45, s58, 8 +; SI-NEXT: s_lshl_b32 s45, s21, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_and_b32 s45, s85, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s56, 24 -; SI-NEXT: s_mov_b32 s76, s56 -; SI-NEXT: s_mov_b32 s56, s85 -; SI-NEXT: s_mov_b32 s85, s79 -; SI-NEXT: s_mov_b32 s79, s19 +; SI-NEXT: s_lshl_b32 s46, s59, 24 +; SI-NEXT: s_mov_b32 s23, s88 +; SI-NEXT: s_mov_b32 s88, s19 ; SI-NEXT: s_or_b32 s19, s46, s45 ; SI-NEXT: s_and_b32 s45, s99, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s21, 24 +; SI-NEXT: s_lshl_b32 s46, s26, 24 ; SI-NEXT: s_and_b32 s44, s44, 0xffff ; SI-NEXT: s_or_b32 s72, s46, s45 ; SI-NEXT: s_or_b32 s73, s44, s19 -; SI-NEXT: s_and_b32 s44, s52, 0xff -; SI-NEXT: s_lshl_b32 s45, s93, 8 +; SI-NEXT: s_and_b32 s44, s39, 0xff +; SI-NEXT: s_lshl_b32 s45, s79, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: s_and_b32 s45, s16, 0xff +; SI-NEXT: s_and_b32 s45, s89, 0xff +; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_lshl_b32 s45, s45, 16 ; SI-NEXT: s_lshl_b32 s46, s91, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_mov_b32 s47, s96 -; SI-NEXT: s_mov_b32 s96, s78 -; SI-NEXT: s_mov_b32 s78, s69 -; SI-NEXT: s_mov_b32 s69, s68 -; SI-NEXT: s_mov_b32 s68, s38 -; SI-NEXT: s_mov_b32 s38, s35 -; SI-NEXT: s_mov_b32 s35, s89 -; SI-NEXT: s_or_b32 s89, s46, s45 -; SI-NEXT: s_and_b32 s45, s50, 0xff -; SI-NEXT: s_or_b32 s5, s5, s57 -; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s55, 24 +; SI-NEXT: s_or_b32 s5, s5, s30 +; SI-NEXT: s_mov_b32 s77, s47 +; SI-NEXT: s_mov_b32 s47, s24 +; SI-NEXT: s_mov_b32 s24, s83 +; SI-NEXT: s_mov_b32 s83, s70 +; SI-NEXT: s_mov_b32 s70, s38 +; SI-NEXT: s_mov_b32 s38, s91 +; SI-NEXT: s_mov_b32 s91, s23 +; SI-NEXT: s_mov_b32 s23, s17 +; SI-NEXT: s_or_b32 s17, s46, s45 +; SI-NEXT: s_and_b32 s45, s55, 0xff ; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_or_b32 s74, s46, s45 -; SI-NEXT: s_mov_b32 s45, s83 -; SI-NEXT: s_mov_b32 s83, s91 -; SI-NEXT: s_mov_b32 s91, s28 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_mov_b32 s76, s59 +; SI-NEXT: s_mov_b32 s59, s57 +; SI-NEXT: s_mov_b32 s57, s34 +; SI-NEXT: s_mov_b32 s34, s82 +; SI-NEXT: s_mov_b32 s82, s52 +; SI-NEXT: s_mov_b32 s52, s95 +; SI-NEXT: s_mov_b32 s95, s27 +; SI-NEXT: s_mov_b32 s21, s22 +; SI-NEXT: s_mov_b32 s22, s69 +; SI-NEXT: s_mov_b32 s69, s48 +; SI-NEXT: s_mov_b32 s48, s37 +; SI-NEXT: s_mov_b32 s37, s56 +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s66, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_mov_b32 s56, s98 +; SI-NEXT: s_mov_b32 s98, s28 ; SI-NEXT: s_and_b32 s28, s42, 0xffff -; SI-NEXT: s_mov_b32 s59, s94 -; SI-NEXT: s_mov_b32 s94, s27 ; SI-NEXT: s_and_b32 s27, s43, 0xffff ; SI-NEXT: s_or_b32 s42, s12, s4 ; SI-NEXT: s_mov_b32 s43, s5 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: s_or_b32 s7, s7, s90 ; SI-NEXT: s_or_b32 s9, s9, vcc_lo -; SI-NEXT: v_writelane_b32 v41, s4, 5 -; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: v_writelane_b32 v41, s5, 6 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 16 ; SI-NEXT: s_or_b32 s11, s11, vcc_hi -; SI-NEXT: v_writelane_b32 v41, s4, 7 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_writelane_b32 v41, s5, 8 -; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 16 -; SI-NEXT: s_or_b32 s7, s7, s77 -; SI-NEXT: s_or_b32 s75, s44, s89 +; SI-NEXT: s_or_b32 s74, s46, s45 +; SI-NEXT: s_or_b32 s75, s44, s17 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_and_b32 s58, s15, 0xffff -; SI-NEXT: s_mov_b32 s44, s82 -; SI-NEXT: s_mov_b32 s82, s81 -; SI-NEXT: s_mov_b32 s81, s55 -; SI-NEXT: s_mov_b32 s55, s54 -; SI-NEXT: s_mov_b32 s54, s51 -; SI-NEXT: s_mov_b32 s51, s37 -; SI-NEXT: s_mov_b32 s37, s16 +; SI-NEXT: s_mov_b32 s92, s16 ; SI-NEXT: s_and_b32 s16, s40, 0xffff -; SI-NEXT: s_mov_b32 s46, s98 -; SI-NEXT: s_mov_b32 s98, s93 -; SI-NEXT: s_and_b32 s93, s41, 0xffff -; SI-NEXT: v_writelane_b32 v41, s4, 9 -; SI-NEXT: s_mov_b32 s39, s49 +; SI-NEXT: s_mov_b32 s31, s29 +; SI-NEXT: s_and_b32 s29, s41, 0xffff +; SI-NEXT: v_writelane_b32 v41, s4, 7 +; SI-NEXT: s_mov_b32 s93, s39 +; SI-NEXT: s_mov_b32 s39, s79 +; SI-NEXT: v_writelane_b32 v41, s5, 8 ; SI-NEXT: s_or_b32 s40, s13, s6 ; SI-NEXT: s_mov_b32 s41, s7 -; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 16 ; SI-NEXT: s_or_b32 s14, s14, s8 ; SI-NEXT: s_mov_b32 s15, s9 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 16 ; SI-NEXT: s_or_b32 s12, s58, s10 ; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: v_writelane_b32 v41, s5, 10 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 ; SI-NEXT: s_or_b32 s10, s16, s60 ; SI-NEXT: s_mov_b32 s11, s61 ; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 -; SI-NEXT: s_or_b32 s8, s93, s62 +; SI-NEXT: s_or_b32 s8, s29, s62 ; SI-NEXT: s_mov_b32 s9, s63 ; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 ; SI-NEXT: s_or_b32 s6, s28, s72 @@ -69012,68 +69016,68 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s4, s27, s74 ; SI-NEXT: s_mov_b32 s5, s75 ; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 -; SI-NEXT: s_mov_b32 s16, s37 -; SI-NEXT: s_mov_b32 s37, s51 -; SI-NEXT: s_mov_b32 s51, s54 -; SI-NEXT: s_mov_b32 s54, s55 -; SI-NEXT: s_mov_b32 s55, s81 -; SI-NEXT: s_mov_b32 s81, s82 -; SI-NEXT: s_mov_b32 s82, s44 -; SI-NEXT: v_readlane_b32 s44, v41, 11 -; SI-NEXT: s_mov_b32 s93, s98 -; SI-NEXT: s_mov_b32 s98, s46 -; SI-NEXT: s_mov_b32 s46, s21 -; SI-NEXT: s_mov_b32 s21, s23 -; SI-NEXT: s_mov_b32 s28, s91 -; SI-NEXT: s_mov_b32 s91, s83 -; SI-NEXT: s_mov_b32 s83, s45 -; SI-NEXT: s_mov_b32 s27, s94 -; SI-NEXT: s_mov_b32 s94, s59 -; SI-NEXT: s_lshr_b32 s23, s57, 16 -; SI-NEXT: s_lshr_b32 s57, s77, 16 -; SI-NEXT: s_lshr_b32 s59, vcc_lo, 16 +; SI-NEXT: s_mov_b32 s16, s92 +; SI-NEXT: s_mov_b32 s79, s39 +; SI-NEXT: s_mov_b32 s39, s93 +; SI-NEXT: s_mov_b32 s29, s31 +; SI-NEXT: s_mov_b32 s28, s98 +; SI-NEXT: s_mov_b32 s98, s56 +; SI-NEXT: s_lshr_b32 s35, s30, 16 +; SI-NEXT: v_readlane_b32 s46, v41, 9 +; SI-NEXT: s_lshr_b32 s56, s90, 16 +; SI-NEXT: s_lshr_b32 s58, vcc_lo, 16 ; SI-NEXT: s_lshr_b32 s61, vcc_hi, 16 -; SI-NEXT: s_lshr_b32 s63, s24, 16 -; SI-NEXT: s_mov_b32 s24, s88 -; SI-NEXT: s_mov_b32 s88, s34 -; SI-NEXT: s_mov_b32 s34, s70 -; SI-NEXT: s_mov_b32 s70, s84 -; SI-NEXT: s_mov_b32 s84, s86 -; SI-NEXT: s_mov_b32 s86, s97 -; SI-NEXT: v_readlane_b32 s97, v41, 12 +; SI-NEXT: s_lshr_b32 s63, s20, 16 +; SI-NEXT: v_readlane_b32 s20, v41, 13 +; SI-NEXT: v_readlane_b32 s93, v41, 12 +; SI-NEXT: v_readlane_b32 s92, v41, 11 ; SI-NEXT: s_lshr_b32 s73, s18, 16 -; SI-NEXT: s_mov_b32 s18, s90 -; SI-NEXT: s_mov_b32 s90, s95 -; SI-NEXT: s_mov_b32 s49, s39 +; SI-NEXT: s_mov_b32 s18, s49 +; SI-NEXT: s_mov_b32 s49, s67 +; SI-NEXT: s_mov_b32 s67, s84 +; SI-NEXT: s_mov_b32 s84, s96 +; SI-NEXT: v_readlane_b32 s96, v41, 14 ; SI-NEXT: s_lshr_b32 s75, s19, 16 -; SI-NEXT: s_mov_b32 s19, s79 -; SI-NEXT: s_mov_b32 s79, s85 -; SI-NEXT: s_mov_b32 s85, s56 -; SI-NEXT: s_mov_b32 s56, s76 -; SI-NEXT: s_lshr_b32 s45, s89, 16 -; SI-NEXT: s_mov_b32 s89, s35 -; SI-NEXT: s_mov_b32 s35, s38 -; SI-NEXT: s_mov_b32 s38, s68 -; SI-NEXT: s_mov_b32 s68, s69 -; SI-NEXT: s_mov_b32 s69, s78 -; SI-NEXT: s_mov_b32 s78, s96 -; SI-NEXT: s_mov_b32 s96, s47 +; SI-NEXT: s_mov_b32 s19, s88 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_mov_b32 s17, s23 +; SI-NEXT: v_readlane_b32 s23, v41, 10 +; SI-NEXT: s_mov_b32 s30, s37 +; SI-NEXT: s_mov_b32 s37, s48 +; SI-NEXT: s_mov_b32 s48, s69 +; SI-NEXT: s_mov_b32 s69, s22 +; SI-NEXT: s_mov_b32 s22, s21 +; SI-NEXT: s_mov_b32 s27, s95 +; SI-NEXT: s_mov_b32 s95, s52 +; SI-NEXT: s_mov_b32 s52, s82 +; SI-NEXT: s_mov_b32 s82, s34 +; SI-NEXT: s_mov_b32 s88, s91 +; SI-NEXT: s_mov_b32 s91, s38 +; SI-NEXT: s_mov_b32 s38, s70 +; SI-NEXT: s_mov_b32 s70, s83 +; SI-NEXT: s_mov_b32 s83, s24 +; SI-NEXT: s_mov_b32 s24, s47 +; SI-NEXT: s_mov_b32 s47, s77 +; SI-NEXT: s_mov_b32 s34, s57 +; SI-NEXT: s_mov_b32 s57, s59 +; SI-NEXT: s_mov_b32 s59, s76 ; SI-NEXT: s_mov_b64 s[76:77], 0 ; SI-NEXT: s_branch .LBB99_3 ; SI-NEXT: .LBB99_2: ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_mov_b64 s[76:77], -1 -; SI-NEXT: v_writelane_b32 v41, s4, 5 -; SI-NEXT: v_writelane_b32 v41, s5, 6 -; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 7 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: v_writelane_b32 v41, s5, 8 +; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr60 @@ -69084,39 +69088,31 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: v_writelane_b32 v41, s4, 7 -; SI-NEXT: v_writelane_b32 v41, s5, 8 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 9 -; SI-NEXT: v_writelane_b32 v41, s5, 10 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: .LBB99_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[76:77] -; SI-NEXT: v_readlane_b32 s76, v41, 5 -; SI-NEXT: v_readlane_b32 s77, v41, 6 -; SI-NEXT: s_mov_b32 s58, s76 ; SI-NEXT: v_readlane_b32 s76, v41, 7 ; SI-NEXT: v_readlane_b32 s77, v41, 8 ; SI-NEXT: s_cbranch_vccnz .LBB99_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s21, s47, 3 ; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s79, 8 -; SI-NEXT: s_add_i32 s50, s50, 3 +; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_add_i32 s50, s55, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s50, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s55, 24 +; SI-NEXT: s_lshl_b32 s5, s66, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s39, s52, 3 +; SI-NEXT: s_add_i32 s39, s39, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s39, 0xff -; SI-NEXT: s_lshl_b32 s6, s93, 8 -; SI-NEXT: s_add_i32 s79, s16, 3 +; SI-NEXT: s_lshl_b32 s6, s79, 8 +; SI-NEXT: s_add_i32 s79, s89, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s7, s79, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 @@ -69124,21 +69120,20 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s6, v41, 4 -; SI-NEXT: s_add_i32 s23, s6, 3 ; SI-NEXT: s_and_b32 s6, s23, 0xff -; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_lshl_b32 s7, s24, 8 ; SI-NEXT: s_add_i32 s99, s99, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s8, s99, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s46, 24 +; SI-NEXT: s_lshl_b32 s7, s26, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_add_i32 s98, s98, 3 -; SI-NEXT: v_readlane_b32 s8, v41, 3 +; SI-NEXT: v_readlane_b32 s8, v41, 6 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s98, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 8 @@ -69146,32 +69141,31 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s9, s85, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_lshl_b32 s8, s56, 24 +; SI-NEXT: s_lshl_b32 s8, s59, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_add_i32 s96, s96, 3 +; SI-NEXT: s_add_i32 s96, s46, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s96, 0xff -; SI-NEXT: s_lshl_b32 s9, s44, 8 -; SI-NEXT: s_add_i32 s83, s83, 3 +; SI-NEXT: s_lshl_b32 s9, s83, 8 +; SI-NEXT: s_add_i32 s83, s34, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v41, 2 ; SI-NEXT: s_and_b32 s10, s83, 0xff ; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s9, s57, 24 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_add_i32 s82, s82, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_and_b32 s9, s82, 0xff -; SI-NEXT: s_lshl_b32 s10, s30, 8 +; SI-NEXT: s_lshl_b32 s10, s25, 8 ; SI-NEXT: s_add_i32 s69, s69, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s11, s69, 0xff ; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: s_lshl_b32 s10, s78, 24 +; SI-NEXT: s_lshl_b32 s10, s22, 24 ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s10, s10, s11 @@ -69179,7 +69173,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s80, 0xff ; SI-NEXT: s_lshl_b32 s11, s87, 8 -; SI-NEXT: s_add_i32 s66, s66, 3 +; SI-NEXT: s_add_i32 s66, s84, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_and_b32 s12, s66, 0xff ; SI-NEXT: s_addk_i32 s10, 0x300 @@ -69190,20 +69184,20 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s65, s65, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_and_b32 s11, s65, 0xff -; SI-NEXT: s_lshl_b32 s12, s84, 8 -; SI-NEXT: s_add_i32 s52, s68, 3 +; SI-NEXT: s_lshl_b32 s12, s81, 8 +; SI-NEXT: s_add_i32 s52, s52, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_and_b32 s13, s52, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 -; SI-NEXT: s_lshl_b32 s12, s70, 24 +; SI-NEXT: s_lshl_b32 s12, s68, 24 ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: s_add_i32 s55, s81, 3 +; SI-NEXT: s_add_i32 s55, s71, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_and_b32 s12, s55, 0xff -; SI-NEXT: s_lshl_b32 s13, s71, 8 -; SI-NEXT: s_add_i32 s48, s34, 3 +; SI-NEXT: s_lshl_b32 s13, s70, 8 +; SI-NEXT: s_add_i32 s48, s48, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_and_b32 s14, s48, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 @@ -69215,7 +69209,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_and_b32 s13, s38, 0xff ; SI-NEXT: s_lshl_b32 s14, s64, 8 -; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: s_add_i32 s31, s97, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_and_b32 s15, s31, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 @@ -69223,84 +69217,85 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: s_add_i32 s36, s54, 3 +; SI-NEXT: s_add_i32 s36, s92, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_and_b32 s14, s36, 0xff ; SI-NEXT: s_lshl_b32 s15, s53, 8 -; SI-NEXT: s_add_i32 s95, s88, 3 +; SI-NEXT: s_add_i32 s95, s95, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s21, s95, 0xff ; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_lshl_b32 s15, s51, 24 +; SI-NEXT: s_lshl_b32 s15, s49, 24 ; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s15, s15, s21 ; SI-NEXT: s_add_i32 s94, s94, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s94, 0xff -; SI-NEXT: s_lshl_b32 s21, s49, 8 -; SI-NEXT: s_add_i32 s91, s35, 3 +; SI-NEXT: s_lshl_b32 s21, s93, 8 +; SI-NEXT: s_add_i32 s91, s28, 3 ; SI-NEXT: s_or_b32 s15, s21, s15 -; SI-NEXT: s_and_b32 s16, s91, 0xff +; SI-NEXT: s_and_b32 s22, s91, 0xff ; SI-NEXT: s_addk_i32 s15, 0x300 ; SI-NEXT: s_lshl_b32 s21, s37, 24 -; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_or_b32 s21, s21, s16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_add_i32 s24, s29, 3 +; SI-NEXT: v_readlane_b32 s23, v41, 1 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s15, s21, s15 ; SI-NEXT: s_and_b32 s21, s24, 0xff -; SI-NEXT: s_lshl_b32 s16, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s21, s16, s21 -; SI-NEXT: s_and_b32 s23, s26, 0xff -; SI-NEXT: s_addk_i32 s21, 0x300 -; SI-NEXT: s_lshl_b32 s16, s27, 24 -; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s23 -; SI-NEXT: s_or_b32 s21, s16, s21 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s40, s21, 0x3000000 -; SI-NEXT: s_and_b32 s21, s28, 0xff -; SI-NEXT: s_lshl_b32 s16, s29, 8 -; SI-NEXT: s_lshl_b32 s23, s89, 24 -; SI-NEXT: s_add_i32 s89, s90, 3 -; SI-NEXT: s_or_b32 s21, s16, s21 -; SI-NEXT: s_and_b32 s16, s89, 0xff -; SI-NEXT: s_addk_i32 s21, 0x300 -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s23, s16 -; SI-NEXT: s_or_b32 s16, s16, s21 -; SI-NEXT: s_add_i32 s41, s16, 0x3000000 -; SI-NEXT: s_add_i32 s16, s92, 3 +; SI-NEXT: s_lshl_b32 s22, s30, 8 +; SI-NEXT: s_add_i32 s26, s23, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_and_b32 s23, s26, 0xff ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s22, s27, 24 +; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s19, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s21, s22, s21 ; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s40, s21, 0x3000000 +; SI-NEXT: v_readlane_b32 s21, v41, 0 ; SI-NEXT: s_add_i32 s42, s16, 0x3000000 -; SI-NEXT: s_add_i32 s16, s20, 3 -; SI-NEXT: v_readlane_b32 s17, v41, 1 +; SI-NEXT: v_readlane_b32 s16, v41, 5 +; SI-NEXT: s_add_i32 s28, s21, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 4 +; SI-NEXT: v_readlane_b32 s18, v41, 3 +; SI-NEXT: s_and_b32 s21, s28, 0xff +; SI-NEXT: s_lshl_b32 s22, s88, 8 +; SI-NEXT: s_add_i32 s89, s54, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s22, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_and_b32 s22, s89, 0xff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v41, 0 +; SI-NEXT: v_readlane_b32 s17, v41, 2 ; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s20, s20, 24 +; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 @@ -69311,49 +69306,49 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s11, s11, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 0x3000000 -; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s14, s14, 0x3000000 ; SI-NEXT: s_add_i32 s15, s15, 0x3000000 -; SI-NEXT: s_lshr_b64 s[58:59], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_add_i32 s41, s20, 0x3000000 +; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[48:49], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 9 -; SI-NEXT: s_lshr_b32 s23, s43, 16 -; SI-NEXT: s_lshr_b32 s57, s41, 16 -; SI-NEXT: s_lshr_b32 s59, s15, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s35, s43, 16 +; SI-NEXT: s_lshr_b32 s56, s41, 16 +; SI-NEXT: s_lshr_b32 s58, s15, 16 ; SI-NEXT: s_lshr_b32 s61, s13, 16 ; SI-NEXT: s_lshr_b32 s63, s11, 16 ; SI-NEXT: s_lshr_b32 s73, s9, 16 ; SI-NEXT: s_lshr_b32 s75, s7, 16 ; SI-NEXT: s_lshr_b32 s45, s5, 16 -; SI-NEXT: v_writelane_b32 v41, s17, 10 ; SI-NEXT: .LBB99_5: ; %end ; SI-NEXT: s_and_b32 s16, s42, 0xffff -; SI-NEXT: s_lshl_b32 s17, s58, 16 +; SI-NEXT: s_lshl_b32 s17, s76, 16 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s43, 0xffff -; SI-NEXT: s_lshl_b32 s18, s23, 16 +; SI-NEXT: s_lshl_b32 s18, s35, 16 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s40, 0xffff -; SI-NEXT: s_lshl_b32 s19, s48, 16 +; SI-NEXT: s_lshl_b32 s19, s78, 16 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s19, s41, 0xffff -; SI-NEXT: s_lshl_b32 s20, s57, 16 +; SI-NEXT: s_lshl_b32 s20, s56, 16 ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s20, s76, 16 +; SI-NEXT: s_lshl_b32 s20, s50, 16 ; SI-NEXT: s_or_b32 s14, s14, s20 ; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_lshl_b32 s20, s59, 16 +; SI-NEXT: s_lshl_b32 s20, s58, 16 ; SI-NEXT: s_or_b32 s15, s15, s20 -; SI-NEXT: v_readlane_b32 s20, v41, 9 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_lshl_b32 s20, s44, 16 ; SI-NEXT: s_or_b32 s12, s12, s20 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_lshl_b32 s20, s61, 16 @@ -69382,7 +69377,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_lshl_b32 s20, s45, 16 ; SI-NEXT: s_or_b32 s5, s5, s20 -; SI-NEXT: v_readlane_b32 s21, v41, 10 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 @@ -81308,7 +81302,6 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v40, s53, 13 ; SI-NEXT: v_writelane_b32 v40, s54, 14 ; SI-NEXT: v_writelane_b32 v40, s55, 15 -; SI-NEXT: s_mov_b32 s92, s16 ; SI-NEXT: v_writelane_b32 v40, s64, 16 ; SI-NEXT: v_writelane_b32 v40, s65, 17 ; SI-NEXT: v_writelane_b32 v40, s66, 18 @@ -81325,52 +81318,54 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: v_writelane_b32 v40, s86, 30 ; SI-NEXT: v_writelane_b32 v40, s87, 31 -; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v40, s96, 32 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v41, s23, 0 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v40, s97, 33 -; SI-NEXT: v_writelane_b32 v41, s21, 1 -; SI-NEXT: v_readfirstlane_b32 s47, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s28, 0 +; SI-NEXT: v_writelane_b32 v41, s26, 1 +; SI-NEXT: v_writelane_b32 v41, s23, 2 +; SI-NEXT: v_writelane_b32 v41, s22, 3 ; SI-NEXT: v_writelane_b32 v40, s98, 34 -; SI-NEXT: v_writelane_b32 v41, s47, 2 +; SI-NEXT: v_writelane_b32 v41, s21, 4 ; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: s_mov_b32 s88, s29 +; SI-NEXT: s_mov_b32 s30, s25 +; SI-NEXT: s_mov_b32 s29, s24 +; SI-NEXT: v_writelane_b32 v41, s20, 5 ; SI-NEXT: v_readfirstlane_b32 s82, v30 -; SI-NEXT: v_readfirstlane_b32 s83, v28 -; SI-NEXT: v_readfirstlane_b32 s44, v27 -; SI-NEXT: v_readfirstlane_b32 s96, v26 -; SI-NEXT: v_readfirstlane_b32 s70, v25 -; SI-NEXT: v_readfirstlane_b32 s68, v24 -; SI-NEXT: v_readfirstlane_b32 s84, v23 +; SI-NEXT: v_readfirstlane_b32 s57, v29 +; SI-NEXT: v_readfirstlane_b32 s34, v28 +; SI-NEXT: v_readfirstlane_b32 s83, v27 +; SI-NEXT: v_readfirstlane_b32 s46, v26 +; SI-NEXT: v_readfirstlane_b32 s68, v25 +; SI-NEXT: v_readfirstlane_b32 s52, v24 +; SI-NEXT: v_readfirstlane_b32 s81, v23 ; SI-NEXT: v_readfirstlane_b32 s65, v22 ; SI-NEXT: v_readfirstlane_b32 s86, v21 -; SI-NEXT: v_readfirstlane_b32 s66, v20 +; SI-NEXT: v_readfirstlane_b32 s84, v20 ; SI-NEXT: v_readfirstlane_b32 s87, v19 ; SI-NEXT: v_readfirstlane_b32 s80, v18 ; SI-NEXT: v_readfirstlane_b32 s36, v17 -; SI-NEXT: v_readfirstlane_b32 s31, v16 +; SI-NEXT: v_readfirstlane_b32 s97, v16 ; SI-NEXT: v_readfirstlane_b32 s64, v15 ; SI-NEXT: v_readfirstlane_b32 s38, v14 ; SI-NEXT: v_readfirstlane_b32 s67, v13 -; SI-NEXT: v_readfirstlane_b32 s34, v12 -; SI-NEXT: v_readfirstlane_b32 s71, v11 -; SI-NEXT: v_readfirstlane_b32 s81, v10 +; SI-NEXT: v_readfirstlane_b32 s48, v12 +; SI-NEXT: v_readfirstlane_b32 s70, v11 +; SI-NEXT: v_readfirstlane_b32 s71, v10 ; SI-NEXT: v_readfirstlane_b32 s37, v9 -; SI-NEXT: v_readfirstlane_b32 s35, v8 -; SI-NEXT: v_readfirstlane_b32 s49, v7 +; SI-NEXT: v_readfirstlane_b32 s28, v8 +; SI-NEXT: v_readfirstlane_b32 s93, v7 ; SI-NEXT: v_readfirstlane_b32 s94, v6 -; SI-NEXT: v_readfirstlane_b32 s51, v5 -; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: v_readfirstlane_b32 s53, v3 -; SI-NEXT: v_readfirstlane_b32 s54, v2 -; SI-NEXT: v_readfirstlane_b32 s89, v1 -; SI-NEXT: v_readfirstlane_b32 s90, v0 +; SI-NEXT: v_readfirstlane_b32 s49, v5 +; SI-NEXT: v_readfirstlane_b32 s95, v4 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s91, v31 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s16, v32 +; SI-NEXT: v_readfirstlane_b32 s89, v32 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s93, v33 +; SI-NEXT: v_readfirstlane_b32 s79, v33 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 @@ -81380,245 +81375,248 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s52, v34 +; SI-NEXT: v_readfirstlane_b32 s39, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s55, v35 +; SI-NEXT: v_readfirstlane_b32 s66, v35 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s79, v37 +; SI-NEXT: v_readfirstlane_b32 s96, v37 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 -; SI-NEXT: v_readfirstlane_b32 s50, v36 +; SI-NEXT: v_readfirstlane_b32 s55, v36 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s21, v38 +; SI-NEXT: v_readfirstlane_b32 s47, v38 +; SI-NEXT: v_readfirstlane_b32 s53, v3 +; SI-NEXT: v_readfirstlane_b32 s92, v2 +; SI-NEXT: v_readfirstlane_b32 s20, v1 +; SI-NEXT: v_readfirstlane_b32 s54, v0 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s56, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s85, v32 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s58, v33 +; SI-NEXT: v_readfirstlane_b32 s21, v33 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s98, v39 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: v_readfirstlane_b32 s26, v48 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s99, v49 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s97, v50 +; SI-NEXT: v_readfirstlane_b32 s24, v50 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s9, v51 -; SI-NEXT: v_writelane_b32 v41, s58, 3 -; SI-NEXT: v_writelane_b32 v41, s9, 4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; SI-NEXT: v_readfirstlane_b32 s23, v51 +; SI-NEXT: v_writelane_b32 v41, s21, 6 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: v_readfirstlane_b32 s22, v34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s69, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v36 +; SI-NEXT: v_readfirstlane_b32 s25, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB107_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s19, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s24, 0xff -; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_and_b32 s5, s29, 0xff +; SI-NEXT: s_lshl_b32 s6, s30, 8 ; SI-NEXT: s_or_b32 s13, s5, s6 -; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_readlane_b32 s5, v41, 1 +; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s27, 24 ; SI-NEXT: s_or_b32 s6, s6, s5 -; SI-NEXT: s_and_b32 s5, s54, 0xff +; SI-NEXT: s_and_b32 s5, s92, 0xff ; SI-NEXT: s_lshl_b32 s7, s53, 8 ; SI-NEXT: s_or_b32 s14, s5, s7 -; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: s_and_b32 s5, s95, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s51, 24 +; SI-NEXT: s_lshl_b32 s7, s49, 24 ; SI-NEXT: s_or_b32 s8, s7, s5 -; SI-NEXT: s_and_b32 s5, s81, 0xff -; SI-NEXT: s_lshl_b32 s7, s71, 8 +; SI-NEXT: s_and_b32 s5, s71, 0xff +; SI-NEXT: s_lshl_b32 s7, s70, 8 ; SI-NEXT: s_or_b32 s15, s5, s7 -; SI-NEXT: s_and_b32 s5, s34, 0xff +; SI-NEXT: s_and_b32 s5, s48, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s7, s67, 24 ; SI-NEXT: s_or_b32 s10, s7, s5 ; SI-NEXT: s_and_b32 s5, s80, 0xff ; SI-NEXT: s_lshl_b32 s7, s87, 8 ; SI-NEXT: s_or_b32 s40, s5, s7 -; SI-NEXT: s_and_b32 s5, s66, 0xff +; SI-NEXT: s_and_b32 s5, s84, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s7, s86, 24 ; SI-NEXT: s_or_b32 s60, s7, s5 -; SI-NEXT: s_and_b32 s5, s96, 0xff -; SI-NEXT: s_lshl_b32 s7, s44, 8 +; SI-NEXT: s_and_b32 s5, s46, 0xff +; SI-NEXT: s_lshl_b32 s7, s83, 8 ; SI-NEXT: s_or_b32 s41, s5, s7 -; SI-NEXT: s_and_b32 s5, s9, 0xff -; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_and_b32 s5, s23, 0xff +; SI-NEXT: s_lshl_b32 s7, s24, 8 ; SI-NEXT: s_or_b32 s42, s5, s7 -; SI-NEXT: s_and_b32 s5, s21, 0xff -; SI-NEXT: s_lshl_b32 s7, s79, 8 +; SI-NEXT: s_and_b32 s5, s47, 0xff +; SI-NEXT: s_lshl_b32 s7, s96, 8 ; SI-NEXT: s_or_b32 s43, s5, s7 -; SI-NEXT: v_readlane_b32 s7, v41, 1 -; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: v_readlane_b32 s5, v41, 5 +; SI-NEXT: v_readlane_b32 s7, v41, 4 +; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s5, s5, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: v_readlane_b32 s9, v41, 0 +; SI-NEXT: v_readlane_b32 s7, v41, 3 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: v_readlane_b32 s9, v41, 2 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s9, s9, 24 -; SI-NEXT: s_or_b32 s57, s9, s7 -; SI-NEXT: s_and_b32 s7, s28, 0xff -; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: v_writelane_b32 v41, s46, 9 +; SI-NEXT: s_mov_b32 s56, s30 +; SI-NEXT: s_or_b32 s30, s9, s7 +; SI-NEXT: v_readlane_b32 s7, v41, 0 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_lshl_b32 s9, s88, 8 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s90, 0xff +; SI-NEXT: s_and_b32 s9, s54, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s89, 24 -; SI-NEXT: s_or_b32 s77, s11, s9 +; SI-NEXT: s_lshl_b32 s11, s20, 24 +; SI-NEXT: s_or_b32 s90, s11, s9 ; SI-NEXT: s_and_b32 s9, s94, 0xff -; SI-NEXT: s_lshl_b32 s11, s49, 8 +; SI-NEXT: s_lshl_b32 s11, s93, 8 ; SI-NEXT: s_or_b32 s9, s9, s11 -; SI-NEXT: s_and_b32 s11, s35, 0xff +; SI-NEXT: s_and_b32 s11, s28, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: v_writelane_b32 v41, s44, 11 ; SI-NEXT: s_lshl_b32 s44, s37, 24 ; SI-NEXT: s_or_b32 vcc_lo, s44, s11 ; SI-NEXT: s_and_b32 s11, s38, 0xff ; SI-NEXT: s_lshl_b32 s44, s64, 8 ; SI-NEXT: s_or_b32 s11, s11, s44 -; SI-NEXT: s_and_b32 s44, s31, 0xff +; SI-NEXT: s_and_b32 s44, s97, 0xff ; SI-NEXT: s_lshl_b32 s44, s44, 16 ; SI-NEXT: s_lshl_b32 s45, s36, 24 +; SI-NEXT: v_writelane_b32 v41, s23, 10 ; SI-NEXT: s_or_b32 vcc_hi, s45, s44 ; SI-NEXT: s_and_b32 s44, s65, 0xff -; SI-NEXT: s_lshl_b32 s45, s84, 8 +; SI-NEXT: s_lshl_b32 s45, s81, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: s_and_b32 s45, s68, 0xff +; SI-NEXT: s_and_b32 s45, s52, 0xff +; SI-NEXT: v_writelane_b32 v41, s92, 11 ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_mov_b32 s23, s21 -; SI-NEXT: s_mov_b32 s21, s46 -; SI-NEXT: s_lshl_b32 s46, s70, 24 +; SI-NEXT: s_lshl_b32 s46, s68, 24 +; SI-NEXT: v_writelane_b32 v41, s93, 12 ; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_writelane_b32 v41, s97, 12 -; SI-NEXT: s_mov_b32 s97, s86 -; SI-NEXT: s_mov_b32 s86, s84 -; SI-NEXT: s_mov_b32 s84, s70 -; SI-NEXT: s_mov_b32 s70, s34 -; SI-NEXT: s_mov_b32 s34, s88 -; SI-NEXT: s_mov_b32 s88, s24 -; SI-NEXT: s_or_b32 s24, s46, s45 -; SI-NEXT: s_or_b32 s61, s44, s24 +; SI-NEXT: v_writelane_b32 v41, s20, 13 +; SI-NEXT: s_or_b32 s20, s46, s45 +; SI-NEXT: s_or_b32 s61, s44, s20 ; SI-NEXT: s_and_b32 s44, s82, 0xff -; SI-NEXT: s_lshl_b32 s45, s30, 8 +; SI-NEXT: s_lshl_b32 s45, s25, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_and_b32 s45, s69, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s78, 24 -; SI-NEXT: s_mov_b32 s95, s90 -; SI-NEXT: s_mov_b32 s90, s18 +; SI-NEXT: s_lshl_b32 s46, s22, 24 +; SI-NEXT: v_writelane_b32 v41, s96, 14 +; SI-NEXT: s_mov_b32 s96, s84 +; SI-NEXT: s_mov_b32 s84, s67 +; SI-NEXT: s_mov_b32 s67, s49 +; SI-NEXT: s_mov_b32 s49, s18 ; SI-NEXT: s_or_b32 s18, s46, s45 -; SI-NEXT: s_and_b32 s45, s83, 0xff +; SI-NEXT: s_and_b32 s45, s34, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s47, 24 +; SI-NEXT: s_lshl_b32 s46, s57, 24 ; SI-NEXT: s_and_b32 s44, s44, 0xffff ; SI-NEXT: s_or_b32 s62, s46, s45 ; SI-NEXT: s_or_b32 s63, s44, s18 ; SI-NEXT: s_and_b32 s44, s98, 0xff -; SI-NEXT: s_lshl_b32 s45, s58, 8 +; SI-NEXT: s_lshl_b32 s45, s21, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_and_b32 s45, s85, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s56, 24 -; SI-NEXT: s_mov_b32 s76, s56 -; SI-NEXT: s_mov_b32 s56, s85 -; SI-NEXT: s_mov_b32 s85, s79 -; SI-NEXT: s_mov_b32 s79, s19 +; SI-NEXT: s_lshl_b32 s46, s59, 24 +; SI-NEXT: s_mov_b32 s23, s88 +; SI-NEXT: s_mov_b32 s88, s19 ; SI-NEXT: s_or_b32 s19, s46, s45 ; SI-NEXT: s_and_b32 s45, s99, 0xff ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s21, 24 +; SI-NEXT: s_lshl_b32 s46, s26, 24 ; SI-NEXT: s_and_b32 s44, s44, 0xffff ; SI-NEXT: s_or_b32 s72, s46, s45 ; SI-NEXT: s_or_b32 s73, s44, s19 -; SI-NEXT: s_and_b32 s44, s52, 0xff -; SI-NEXT: s_lshl_b32 s45, s93, 8 +; SI-NEXT: s_and_b32 s44, s39, 0xff +; SI-NEXT: s_lshl_b32 s45, s79, 8 ; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: s_and_b32 s45, s16, 0xff +; SI-NEXT: s_and_b32 s45, s89, 0xff +; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_lshl_b32 s45, s45, 16 ; SI-NEXT: s_lshl_b32 s46, s91, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_mov_b32 s47, s96 -; SI-NEXT: s_mov_b32 s96, s78 -; SI-NEXT: s_mov_b32 s78, s69 -; SI-NEXT: s_mov_b32 s69, s68 -; SI-NEXT: s_mov_b32 s68, s38 -; SI-NEXT: s_mov_b32 s38, s35 -; SI-NEXT: s_mov_b32 s35, s89 -; SI-NEXT: s_or_b32 s89, s46, s45 -; SI-NEXT: s_and_b32 s45, s50, 0xff -; SI-NEXT: s_or_b32 s5, s5, s57 -; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_lshl_b32 s46, s55, 24 +; SI-NEXT: s_or_b32 s5, s5, s30 +; SI-NEXT: s_mov_b32 s77, s47 +; SI-NEXT: s_mov_b32 s47, s24 +; SI-NEXT: s_mov_b32 s24, s83 +; SI-NEXT: s_mov_b32 s83, s70 +; SI-NEXT: s_mov_b32 s70, s38 +; SI-NEXT: s_mov_b32 s38, s91 +; SI-NEXT: s_mov_b32 s91, s23 +; SI-NEXT: s_mov_b32 s23, s17 +; SI-NEXT: s_or_b32 s17, s46, s45 +; SI-NEXT: s_and_b32 s45, s55, 0xff ; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_or_b32 s74, s46, s45 -; SI-NEXT: s_mov_b32 s45, s83 -; SI-NEXT: s_mov_b32 s83, s91 -; SI-NEXT: s_mov_b32 s91, s28 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_mov_b32 s76, s59 +; SI-NEXT: s_mov_b32 s59, s57 +; SI-NEXT: s_mov_b32 s57, s34 +; SI-NEXT: s_mov_b32 s34, s82 +; SI-NEXT: s_mov_b32 s82, s52 +; SI-NEXT: s_mov_b32 s52, s95 +; SI-NEXT: s_mov_b32 s95, s27 +; SI-NEXT: s_mov_b32 s21, s22 +; SI-NEXT: s_mov_b32 s22, s69 +; SI-NEXT: s_mov_b32 s69, s48 +; SI-NEXT: s_mov_b32 s48, s37 +; SI-NEXT: s_mov_b32 s37, s56 +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s66, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_mov_b32 s56, s98 +; SI-NEXT: s_mov_b32 s98, s28 ; SI-NEXT: s_and_b32 s28, s42, 0xffff -; SI-NEXT: s_mov_b32 s59, s94 -; SI-NEXT: s_mov_b32 s94, s27 ; SI-NEXT: s_and_b32 s27, s43, 0xffff ; SI-NEXT: s_or_b32 s42, s12, s4 ; SI-NEXT: s_mov_b32 s43, s5 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: s_or_b32 s7, s7, s90 ; SI-NEXT: s_or_b32 s9, s9, vcc_lo -; SI-NEXT: v_writelane_b32 v41, s4, 5 -; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: v_writelane_b32 v41, s5, 6 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 16 ; SI-NEXT: s_or_b32 s11, s11, vcc_hi -; SI-NEXT: v_writelane_b32 v41, s4, 7 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_writelane_b32 v41, s5, 8 -; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 16 -; SI-NEXT: s_or_b32 s7, s7, s77 -; SI-NEXT: s_or_b32 s75, s44, s89 +; SI-NEXT: s_or_b32 s74, s46, s45 +; SI-NEXT: s_or_b32 s75, s44, s17 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_and_b32 s58, s15, 0xffff -; SI-NEXT: s_mov_b32 s44, s82 -; SI-NEXT: s_mov_b32 s82, s81 -; SI-NEXT: s_mov_b32 s81, s55 -; SI-NEXT: s_mov_b32 s55, s54 -; SI-NEXT: s_mov_b32 s54, s51 -; SI-NEXT: s_mov_b32 s51, s37 -; SI-NEXT: s_mov_b32 s37, s16 +; SI-NEXT: s_mov_b32 s92, s16 ; SI-NEXT: s_and_b32 s16, s40, 0xffff -; SI-NEXT: s_mov_b32 s46, s98 -; SI-NEXT: s_mov_b32 s98, s93 -; SI-NEXT: s_and_b32 s93, s41, 0xffff -; SI-NEXT: v_writelane_b32 v41, s4, 9 -; SI-NEXT: s_mov_b32 s39, s49 +; SI-NEXT: s_mov_b32 s31, s29 +; SI-NEXT: s_and_b32 s29, s41, 0xffff +; SI-NEXT: v_writelane_b32 v41, s4, 7 +; SI-NEXT: s_mov_b32 s93, s39 +; SI-NEXT: s_mov_b32 s39, s79 +; SI-NEXT: v_writelane_b32 v41, s5, 8 ; SI-NEXT: s_or_b32 s40, s13, s6 ; SI-NEXT: s_mov_b32 s41, s7 -; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 16 ; SI-NEXT: s_or_b32 s14, s14, s8 ; SI-NEXT: s_mov_b32 s15, s9 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 16 ; SI-NEXT: s_or_b32 s12, s58, s10 ; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: v_writelane_b32 v41, s5, 10 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 ; SI-NEXT: s_or_b32 s10, s16, s60 ; SI-NEXT: s_mov_b32 s11, s61 ; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 -; SI-NEXT: s_or_b32 s8, s93, s62 +; SI-NEXT: s_or_b32 s8, s29, s62 ; SI-NEXT: s_mov_b32 s9, s63 ; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 ; SI-NEXT: s_or_b32 s6, s28, s72 @@ -81627,68 +81625,68 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s4, s27, s74 ; SI-NEXT: s_mov_b32 s5, s75 ; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 -; SI-NEXT: s_mov_b32 s16, s37 -; SI-NEXT: s_mov_b32 s37, s51 -; SI-NEXT: s_mov_b32 s51, s54 -; SI-NEXT: s_mov_b32 s54, s55 -; SI-NEXT: s_mov_b32 s55, s81 -; SI-NEXT: s_mov_b32 s81, s82 -; SI-NEXT: s_mov_b32 s82, s44 -; SI-NEXT: v_readlane_b32 s44, v41, 11 -; SI-NEXT: s_mov_b32 s93, s98 -; SI-NEXT: s_mov_b32 s98, s46 -; SI-NEXT: s_mov_b32 s46, s21 -; SI-NEXT: s_mov_b32 s21, s23 -; SI-NEXT: s_mov_b32 s28, s91 -; SI-NEXT: s_mov_b32 s91, s83 -; SI-NEXT: s_mov_b32 s83, s45 -; SI-NEXT: s_mov_b32 s27, s94 -; SI-NEXT: s_mov_b32 s94, s59 -; SI-NEXT: s_lshr_b32 s23, s57, 16 -; SI-NEXT: s_lshr_b32 s57, s77, 16 -; SI-NEXT: s_lshr_b32 s59, vcc_lo, 16 +; SI-NEXT: s_mov_b32 s16, s92 +; SI-NEXT: s_mov_b32 s79, s39 +; SI-NEXT: s_mov_b32 s39, s93 +; SI-NEXT: s_mov_b32 s29, s31 +; SI-NEXT: s_mov_b32 s28, s98 +; SI-NEXT: s_mov_b32 s98, s56 +; SI-NEXT: s_lshr_b32 s35, s30, 16 +; SI-NEXT: v_readlane_b32 s46, v41, 9 +; SI-NEXT: s_lshr_b32 s56, s90, 16 +; SI-NEXT: s_lshr_b32 s58, vcc_lo, 16 ; SI-NEXT: s_lshr_b32 s61, vcc_hi, 16 -; SI-NEXT: s_lshr_b32 s63, s24, 16 -; SI-NEXT: s_mov_b32 s24, s88 -; SI-NEXT: s_mov_b32 s88, s34 -; SI-NEXT: s_mov_b32 s34, s70 -; SI-NEXT: s_mov_b32 s70, s84 -; SI-NEXT: s_mov_b32 s84, s86 -; SI-NEXT: s_mov_b32 s86, s97 -; SI-NEXT: v_readlane_b32 s97, v41, 12 +; SI-NEXT: s_lshr_b32 s63, s20, 16 +; SI-NEXT: v_readlane_b32 s20, v41, 13 +; SI-NEXT: v_readlane_b32 s93, v41, 12 +; SI-NEXT: v_readlane_b32 s92, v41, 11 ; SI-NEXT: s_lshr_b32 s73, s18, 16 -; SI-NEXT: s_mov_b32 s18, s90 -; SI-NEXT: s_mov_b32 s90, s95 -; SI-NEXT: s_mov_b32 s49, s39 +; SI-NEXT: s_mov_b32 s18, s49 +; SI-NEXT: s_mov_b32 s49, s67 +; SI-NEXT: s_mov_b32 s67, s84 +; SI-NEXT: s_mov_b32 s84, s96 +; SI-NEXT: v_readlane_b32 s96, v41, 14 ; SI-NEXT: s_lshr_b32 s75, s19, 16 -; SI-NEXT: s_mov_b32 s19, s79 -; SI-NEXT: s_mov_b32 s79, s85 -; SI-NEXT: s_mov_b32 s85, s56 -; SI-NEXT: s_mov_b32 s56, s76 -; SI-NEXT: s_lshr_b32 s45, s89, 16 -; SI-NEXT: s_mov_b32 s89, s35 -; SI-NEXT: s_mov_b32 s35, s38 -; SI-NEXT: s_mov_b32 s38, s68 -; SI-NEXT: s_mov_b32 s68, s69 -; SI-NEXT: s_mov_b32 s69, s78 -; SI-NEXT: s_mov_b32 s78, s96 -; SI-NEXT: s_mov_b32 s96, s47 +; SI-NEXT: s_mov_b32 s19, s88 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_mov_b32 s17, s23 +; SI-NEXT: v_readlane_b32 s23, v41, 10 +; SI-NEXT: s_mov_b32 s30, s37 +; SI-NEXT: s_mov_b32 s37, s48 +; SI-NEXT: s_mov_b32 s48, s69 +; SI-NEXT: s_mov_b32 s69, s22 +; SI-NEXT: s_mov_b32 s22, s21 +; SI-NEXT: s_mov_b32 s27, s95 +; SI-NEXT: s_mov_b32 s95, s52 +; SI-NEXT: s_mov_b32 s52, s82 +; SI-NEXT: s_mov_b32 s82, s34 +; SI-NEXT: s_mov_b32 s88, s91 +; SI-NEXT: s_mov_b32 s91, s38 +; SI-NEXT: s_mov_b32 s38, s70 +; SI-NEXT: s_mov_b32 s70, s83 +; SI-NEXT: s_mov_b32 s83, s24 +; SI-NEXT: s_mov_b32 s24, s47 +; SI-NEXT: s_mov_b32 s47, s77 +; SI-NEXT: s_mov_b32 s34, s57 +; SI-NEXT: s_mov_b32 s57, s59 +; SI-NEXT: s_mov_b32 s59, s76 ; SI-NEXT: s_mov_b64 s[76:77], 0 ; SI-NEXT: s_branch .LBB107_3 ; SI-NEXT: .LBB107_2: ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_mov_b64 s[76:77], -1 -; SI-NEXT: v_writelane_b32 v41, s4, 5 -; SI-NEXT: v_writelane_b32 v41, s5, 6 -; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 7 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: v_writelane_b32 v41, s5, 8 +; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr60 @@ -81699,39 +81697,31 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: v_writelane_b32 v41, s4, 7 -; SI-NEXT: v_writelane_b32 v41, s5, 8 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 9 -; SI-NEXT: v_writelane_b32 v41, s5, 10 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: .LBB107_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[76:77] -; SI-NEXT: v_readlane_b32 s76, v41, 5 -; SI-NEXT: v_readlane_b32 s77, v41, 6 -; SI-NEXT: s_mov_b32 s58, s76 ; SI-NEXT: v_readlane_b32 s76, v41, 7 ; SI-NEXT: v_readlane_b32 s77, v41, 8 ; SI-NEXT: s_cbranch_vccnz .LBB107_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s21, s47, 3 ; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s79, 8 -; SI-NEXT: s_add_i32 s50, s50, 3 +; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_add_i32 s50, s55, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s50, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s55, 24 +; SI-NEXT: s_lshl_b32 s5, s66, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s39, s52, 3 +; SI-NEXT: s_add_i32 s39, s39, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s39, 0xff -; SI-NEXT: s_lshl_b32 s6, s93, 8 -; SI-NEXT: s_add_i32 s79, s16, 3 +; SI-NEXT: s_lshl_b32 s6, s79, 8 +; SI-NEXT: s_add_i32 s79, s89, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s7, s79, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 @@ -81739,21 +81729,20 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s6, v41, 4 -; SI-NEXT: s_add_i32 s23, s6, 3 ; SI-NEXT: s_and_b32 s6, s23, 0xff -; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_lshl_b32 s7, s24, 8 ; SI-NEXT: s_add_i32 s99, s99, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s8, s99, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s46, 24 +; SI-NEXT: s_lshl_b32 s7, s26, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_add_i32 s98, s98, 3 -; SI-NEXT: v_readlane_b32 s8, v41, 3 +; SI-NEXT: v_readlane_b32 s8, v41, 6 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s98, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 8 @@ -81761,32 +81750,31 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s9, s85, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_lshl_b32 s8, s56, 24 +; SI-NEXT: s_lshl_b32 s8, s59, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_add_i32 s96, s96, 3 +; SI-NEXT: s_add_i32 s96, s46, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s96, 0xff -; SI-NEXT: s_lshl_b32 s9, s44, 8 -; SI-NEXT: s_add_i32 s83, s83, 3 +; SI-NEXT: s_lshl_b32 s9, s83, 8 +; SI-NEXT: s_add_i32 s83, s34, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v41, 2 ; SI-NEXT: s_and_b32 s10, s83, 0xff ; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s9, s57, 24 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_add_i32 s82, s82, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_and_b32 s9, s82, 0xff -; SI-NEXT: s_lshl_b32 s10, s30, 8 +; SI-NEXT: s_lshl_b32 s10, s25, 8 ; SI-NEXT: s_add_i32 s69, s69, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s11, s69, 0xff ; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: s_lshl_b32 s10, s78, 24 +; SI-NEXT: s_lshl_b32 s10, s22, 24 ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s10, s10, s11 @@ -81794,7 +81782,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s80, 0xff ; SI-NEXT: s_lshl_b32 s11, s87, 8 -; SI-NEXT: s_add_i32 s66, s66, 3 +; SI-NEXT: s_add_i32 s66, s84, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_and_b32 s12, s66, 0xff ; SI-NEXT: s_addk_i32 s10, 0x300 @@ -81805,20 +81793,20 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s65, s65, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_and_b32 s11, s65, 0xff -; SI-NEXT: s_lshl_b32 s12, s84, 8 -; SI-NEXT: s_add_i32 s52, s68, 3 +; SI-NEXT: s_lshl_b32 s12, s81, 8 +; SI-NEXT: s_add_i32 s52, s52, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_and_b32 s13, s52, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 -; SI-NEXT: s_lshl_b32 s12, s70, 24 +; SI-NEXT: s_lshl_b32 s12, s68, 24 ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: s_add_i32 s55, s81, 3 +; SI-NEXT: s_add_i32 s55, s71, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_and_b32 s12, s55, 0xff -; SI-NEXT: s_lshl_b32 s13, s71, 8 -; SI-NEXT: s_add_i32 s48, s34, 3 +; SI-NEXT: s_lshl_b32 s13, s70, 8 +; SI-NEXT: s_add_i32 s48, s48, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_and_b32 s14, s48, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 @@ -81830,7 +81818,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_and_b32 s13, s38, 0xff ; SI-NEXT: s_lshl_b32 s14, s64, 8 -; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: s_add_i32 s31, s97, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_and_b32 s15, s31, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 @@ -81838,84 +81826,85 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: s_add_i32 s36, s54, 3 +; SI-NEXT: s_add_i32 s36, s92, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_and_b32 s14, s36, 0xff ; SI-NEXT: s_lshl_b32 s15, s53, 8 -; SI-NEXT: s_add_i32 s95, s88, 3 +; SI-NEXT: s_add_i32 s95, s95, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s21, s95, 0xff ; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_lshl_b32 s15, s51, 24 +; SI-NEXT: s_lshl_b32 s15, s49, 24 ; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s15, s15, s21 ; SI-NEXT: s_add_i32 s94, s94, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s94, 0xff -; SI-NEXT: s_lshl_b32 s21, s49, 8 -; SI-NEXT: s_add_i32 s91, s35, 3 +; SI-NEXT: s_lshl_b32 s21, s93, 8 +; SI-NEXT: s_add_i32 s91, s28, 3 ; SI-NEXT: s_or_b32 s15, s21, s15 -; SI-NEXT: s_and_b32 s16, s91, 0xff +; SI-NEXT: s_and_b32 s22, s91, 0xff ; SI-NEXT: s_addk_i32 s15, 0x300 ; SI-NEXT: s_lshl_b32 s21, s37, 24 -; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_or_b32 s21, s21, s16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_add_i32 s24, s29, 3 +; SI-NEXT: v_readlane_b32 s23, v41, 1 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s15, s21, s15 ; SI-NEXT: s_and_b32 s21, s24, 0xff -; SI-NEXT: s_lshl_b32 s16, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s21, s16, s21 -; SI-NEXT: s_and_b32 s23, s26, 0xff -; SI-NEXT: s_addk_i32 s21, 0x300 -; SI-NEXT: s_lshl_b32 s16, s27, 24 -; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s23 -; SI-NEXT: s_or_b32 s21, s16, s21 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s40, s21, 0x3000000 -; SI-NEXT: s_and_b32 s21, s28, 0xff -; SI-NEXT: s_lshl_b32 s16, s29, 8 -; SI-NEXT: s_lshl_b32 s23, s89, 24 -; SI-NEXT: s_add_i32 s89, s90, 3 -; SI-NEXT: s_or_b32 s21, s16, s21 -; SI-NEXT: s_and_b32 s16, s89, 0xff -; SI-NEXT: s_addk_i32 s21, 0x300 -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s23, s16 -; SI-NEXT: s_or_b32 s16, s16, s21 -; SI-NEXT: s_add_i32 s41, s16, 0x3000000 -; SI-NEXT: s_add_i32 s16, s92, 3 +; SI-NEXT: s_lshl_b32 s22, s30, 8 +; SI-NEXT: s_add_i32 s26, s23, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_and_b32 s23, s26, 0xff ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s22, s27, 24 +; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s19, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s21, s22, s21 ; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s40, s21, 0x3000000 +; SI-NEXT: v_readlane_b32 s21, v41, 0 ; SI-NEXT: s_add_i32 s42, s16, 0x3000000 -; SI-NEXT: s_add_i32 s16, s20, 3 -; SI-NEXT: v_readlane_b32 s17, v41, 1 +; SI-NEXT: v_readlane_b32 s16, v41, 5 +; SI-NEXT: s_add_i32 s28, s21, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 4 +; SI-NEXT: v_readlane_b32 s18, v41, 3 +; SI-NEXT: s_and_b32 s21, s28, 0xff +; SI-NEXT: s_lshl_b32 s22, s88, 8 +; SI-NEXT: s_add_i32 s89, s54, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s22, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_and_b32 s22, s89, 0xff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v41, 0 +; SI-NEXT: v_readlane_b32 s17, v41, 2 ; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s20, s20, 24 +; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 @@ -81926,49 +81915,49 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s11, s11, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 0x3000000 -; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s14, s14, 0x3000000 ; SI-NEXT: s_add_i32 s15, s15, 0x3000000 -; SI-NEXT: s_lshr_b64 s[58:59], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_add_i32 s41, s20, 0x3000000 +; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[48:49], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 9 -; SI-NEXT: s_lshr_b32 s23, s43, 16 -; SI-NEXT: s_lshr_b32 s57, s41, 16 -; SI-NEXT: s_lshr_b32 s59, s15, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s35, s43, 16 +; SI-NEXT: s_lshr_b32 s56, s41, 16 +; SI-NEXT: s_lshr_b32 s58, s15, 16 ; SI-NEXT: s_lshr_b32 s61, s13, 16 ; SI-NEXT: s_lshr_b32 s63, s11, 16 ; SI-NEXT: s_lshr_b32 s73, s9, 16 ; SI-NEXT: s_lshr_b32 s75, s7, 16 ; SI-NEXT: s_lshr_b32 s45, s5, 16 -; SI-NEXT: v_writelane_b32 v41, s17, 10 ; SI-NEXT: .LBB107_5: ; %end ; SI-NEXT: s_and_b32 s16, s42, 0xffff -; SI-NEXT: s_lshl_b32 s17, s58, 16 +; SI-NEXT: s_lshl_b32 s17, s76, 16 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s43, 0xffff -; SI-NEXT: s_lshl_b32 s18, s23, 16 +; SI-NEXT: s_lshl_b32 s18, s35, 16 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s40, 0xffff -; SI-NEXT: s_lshl_b32 s19, s48, 16 +; SI-NEXT: s_lshl_b32 s19, s78, 16 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s19, s41, 0xffff -; SI-NEXT: s_lshl_b32 s20, s57, 16 +; SI-NEXT: s_lshl_b32 s20, s56, 16 ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s20, s76, 16 +; SI-NEXT: s_lshl_b32 s20, s50, 16 ; SI-NEXT: s_or_b32 s14, s14, s20 ; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_lshl_b32 s20, s59, 16 +; SI-NEXT: s_lshl_b32 s20, s58, 16 ; SI-NEXT: s_or_b32 s15, s15, s20 -; SI-NEXT: v_readlane_b32 s20, v41, 9 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_lshl_b32 s20, s44, 16 ; SI-NEXT: s_or_b32 s12, s12, s20 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_lshl_b32 s20, s61, 16 @@ -81997,7 +81986,6 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_lshl_b32 s20, s45, 16 ; SI-NEXT: s_or_b32 s5, s5, s20 -; SI-NEXT: v_readlane_b32 s21, v41, 10 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll index c167834470e3..83e62cbb9b6f 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -27,7 +27,7 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocaptu ; CHECK-NEXT: s_addc_u32 s1, s3, s5 ; CHECK-NEXT: s_bfe_u32 s2, s6, 0xd0003 ; CHECK-NEXT: s_add_i32 s2, s2, s7 -; CHECK-NEXT: s_or_b32 s2, s2, 0xc0 +; CHECK-NEXT: s_addk_i32 s2, 0xc0 ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index eff0680fe9a3..5283233a0b46 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -3798,36 +3798,36 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; VI-NEXT: s_min_u32 s3, s3, s7 ; VI-NEXT: s_min_u32 s10, s11, s10 ; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_or_b32 s3, s10, s3 -; VI-NEXT: s_and_b32 s7, s6, 0xffff -; VI-NEXT: s_and_b32 s10, s2, 0xffff +; VI-NEXT: s_or_b32 s10, s10, s3 +; VI-NEXT: s_and_b32 s3, s6, 0xffff +; VI-NEXT: s_and_b32 s7, s2, 0xffff ; VI-NEXT: s_lshr_b32 s6, s6, 16 ; VI-NEXT: s_lshr_b32 s2, s2, 16 ; VI-NEXT: s_min_u32 s2, s2, s6 -; VI-NEXT: s_min_u32 s7, s10, s7 +; VI-NEXT: s_min_u32 s3, s7, s3 ; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s2, s7, s2 -; VI-NEXT: s_and_b32 s6, s5, 0xffff -; VI-NEXT: s_and_b32 s7, s1, 0xffff +; VI-NEXT: s_or_b32 s3, s3, s2 +; VI-NEXT: s_and_b32 s2, s5, 0xffff +; VI-NEXT: s_and_b32 s6, s1, 0xffff ; VI-NEXT: s_lshr_b32 s5, s5, 16 ; VI-NEXT: s_lshr_b32 s1, s1, 16 ; VI-NEXT: s_min_u32 s1, s1, s5 -; VI-NEXT: s_min_u32 s6, s7, s6 +; VI-NEXT: s_min_u32 s2, s6, s2 ; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_or_b32 s1, s6, s1 -; VI-NEXT: s_and_b32 s5, s4, 0xffff -; VI-NEXT: s_and_b32 s6, s0, 0xffff +; VI-NEXT: s_or_b32 s2, s2, s1 +; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: s_and_b32 s5, s0, 0xffff ; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: s_lshr_b32 s0, s0, 16 ; VI-NEXT: s_min_u32 s0, s0, s4 -; VI-NEXT: s_min_u32 s5, s6, s5 +; VI-NEXT: s_min_u32 s1, s5, s1 ; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_or_b32 s0, s5, s0 +; VI-NEXT: s_or_b32 s1, s1, s0 ; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s10 ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/s_or_b32_transformation.ll b/llvm/test/CodeGen/AMDGPU/s_or_b32_transformation.ll new file mode 100644 index 000000000000..c0ad0c0c64ad --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s_or_b32_transformation.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s +; This tests if disjoint s_or_b32 gets transformed to s_addk_i32 when we can't use s_bitset1_b32 + +define amdgpu_ps i32 @s_or_b32_i32(i32 inreg %x) { +; CHECK-LABEL: s_or_b32_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_or_b32 s0, s0, 0x101 +; CHECK-NEXT: ; return to shader part epilog + %or = or i32 %x, 257 + ret i32 %or +} + +define amdgpu_ps i32 @s_or_b32_disjoint_to_s_addk_i32(i32 inreg %x) { +; CHECK-LABEL: s_or_b32_disjoint_to_s_addk_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_addk_i32 s0, 0x101 +; CHECK-NEXT: ; return to shader part epilog + %or = or disjoint i32 %x, 257 + ret i32 %or +} + +define amdgpu_ps i32 @s_or_b32_to_s_bitset1_b32(i32 inreg %x) { +; CHECK-LABEL: s_or_b32_to_s_bitset1_b32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bitset1_b32 s0, 8 +; CHECK-NEXT: ; return to shader part epilog + %or = or disjoint i32 %x, 256 + ret i32 %or +} +